Compare commits
	
		
			4 Commits
		
	
	
		
			gromacs_ma
			...
			superclust
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 
						 | 
					924914e4f0 | ||
| 
						 | 
					055a009dbd | ||
| 
						 | 
					182c065fe2 | ||
| 
						 | 
					ee3f6de050 | 
							
								
								
									
										23
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										23
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							@@ -51,17 +51,14 @@ Module.symvers
 | 
			
		||||
Mkfile.old
 | 
			
		||||
dkms.conf
 | 
			
		||||
 | 
			
		||||
# Logs
 | 
			
		||||
*.log
 | 
			
		||||
 | 
			
		||||
# TODO list
 | 
			
		||||
todo.txt
 | 
			
		||||
 | 
			
		||||
# Build directories and executables
 | 
			
		||||
#GCC-*/
 | 
			
		||||
#ICC-*/
 | 
			
		||||
#ICX-*/
 | 
			
		||||
#CLANG-*/
 | 
			
		||||
#NVCC-*/
 | 
			
		||||
build-*/
 | 
			
		||||
MDBench-*
 | 
			
		||||
GCC/
 | 
			
		||||
ICC/
 | 
			
		||||
ICX/
 | 
			
		||||
CLANG/
 | 
			
		||||
NVCC/
 | 
			
		||||
MDBench-GCC*
 | 
			
		||||
MDBench-ICC*
 | 
			
		||||
MDBench-ICX*
 | 
			
		||||
MDBench-CLANG*
 | 
			
		||||
MDBench-NVCC*
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										16
									
								
								Makefile
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								Makefile
									
									
									
									
									
								
							@@ -1,7 +1,6 @@
 | 
			
		||||
#CONFIGURE BUILD SYSTEM
 | 
			
		||||
IDENTIFIER = $(OPT_SCHEME)-$(TAG)-$(ISA)-$(DATA_TYPE)
 | 
			
		||||
TARGET	   = MDBench-$(IDENTIFIER)
 | 
			
		||||
BUILD_DIR  = ./build-$(IDENTIFIER)
 | 
			
		||||
TARGET	   = MDBench-$(TAG)-$(OPT_SCHEME)
 | 
			
		||||
BUILD_DIR  = ./$(TAG)-$(OPT_SCHEME)
 | 
			
		||||
SRC_DIR    = ./$(OPT_SCHEME)
 | 
			
		||||
ASM_DIR    = ./asm
 | 
			
		||||
COMMON_DIR = ./common
 | 
			
		||||
@@ -98,6 +97,10 @@ ifeq ($(strip $(USE_SIMD_KERNEL)),true)
 | 
			
		||||
    DEFINES += -DUSE_SIMD_KERNEL
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(strip $(USE_SUPER_CLUSTERS)),true)
 | 
			
		||||
    DEFINES += -DUSE_SUPER_CLUSTERS
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
VPATH     = $(SRC_DIR) $(ASM_DIR) $(CUDA_DIR)
 | 
			
		||||
ASM       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 | 
			
		||||
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
 | 
			
		||||
@@ -152,13 +155,6 @@ $(BUILD_DIR)/%.o:  %.s
 | 
			
		||||
clean:
 | 
			
		||||
	$(info ===>  CLEAN)
 | 
			
		||||
	@rm -rf $(BUILD_DIR)
 | 
			
		||||
	@rm -rf MDBench-$(IDENTIFIER)
 | 
			
		||||
	@rm -f tags
 | 
			
		||||
 | 
			
		||||
cleanall:
 | 
			
		||||
	$(info ===>  CLEAN)
 | 
			
		||||
	@rm -rf build-*
 | 
			
		||||
	@rm -rf MDBench-*
 | 
			
		||||
	@rm -f tags
 | 
			
		||||
 | 
			
		||||
distclean: clean
 | 
			
		||||
 
 | 
			
		||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -8,11 +8,9 @@
 | 
			
		||||
#define __PARAMETER_H_
 | 
			
		||||
 | 
			
		||||
#if PRECISION == 1
 | 
			
		||||
#   define MD_FLOAT float
 | 
			
		||||
#   define MD_UINT  unsigned int
 | 
			
		||||
#define MD_FLOAT float
 | 
			
		||||
#else
 | 
			
		||||
#   define MD_FLOAT double
 | 
			
		||||
#   define MD_UINT  unsigned long long int
 | 
			
		||||
#define MD_FLOAT double
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
typedef struct {
 | 
			
		||||
 
 | 
			
		||||
@@ -9,13 +9,10 @@
 | 
			
		||||
#   include <zmmintrin.h>
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#define MD_SIMD_FLOAT       __m512d
 | 
			
		||||
#define MD_SIMD_MASK        __mmask8
 | 
			
		||||
#define MD_SIMD_INT         __m256i
 | 
			
		||||
#define MD_SIMD_BITMASK     MD_SIMD_INT
 | 
			
		||||
#define MD_SIMD_IBOOL       __mmask16
 | 
			
		||||
#define MD_SIMD_FLOAT   __m512d
 | 
			
		||||
#define MD_SIMD_MASK    __mmask8
 | 
			
		||||
#define MD_SIMD_INT     __m256i
 | 
			
		||||
 | 
			
		||||
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return (__mmask8)(a); }
 | 
			
		||||
static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
 | 
			
		||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
 | 
			
		||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
 | 
			
		||||
 
 | 
			
		||||
@@ -7,30 +7,11 @@
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <string.h>
 | 
			
		||||
#include <immintrin.h>
 | 
			
		||||
#ifndef NO_ZMM_INTRIN
 | 
			
		||||
#   include <zmmintrin.h>
 | 
			
		||||
#endif
 | 
			
		||||
#include <zmmintrin.h>
 | 
			
		||||
 | 
			
		||||
#define MD_SIMD_FLOAT       __m512
 | 
			
		||||
#define MD_SIMD_MASK        __mmask16
 | 
			
		||||
#define MD_SIMD_INT         __m256i
 | 
			
		||||
#define MD_SIMD_IBOOL       __mmask16
 | 
			
		||||
#define MD_SIMD_INT32       __m512i
 | 
			
		||||
#define MD_SIMD_BITMASK     MD_SIMD_INT32
 | 
			
		||||
 | 
			
		||||
static inline MD_SIMD_BITMASK simd_load_bitmask(const int *m) {
 | 
			
		||||
    return _mm512_load_si512(m);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline MD_SIMD_INT32 simd_int32_broadcast(int a) {
 | 
			
		||||
    return _mm512_set1_epi32(a);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline MD_SIMD_IBOOL simd_test_bits(MD_SIMD_FLOAT a) {
 | 
			
		||||
    return _mm512_test_epi32_mask(_mm512_castps_si512(a), _mm512_castps_si512(a));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline MD_SIMD_MASK cvtIB2B(MD_SIMD_IBOOL a) { return a; }
 | 
			
		||||
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
 | 
			
		||||
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
 | 
			
		||||
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
 | 
			
		||||
@@ -88,7 +69,7 @@ static inline MD_FLOAT simd_h_dual_incr_reduced_sum(float* m, MD_SIMD_FLOAT v0,
 | 
			
		||||
    return _mm_cvtss_f32(t3);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
 | 
			
		||||
inline void simd_h_decr(MD_FLOAT *m, MD_SIMD_FLOAT a) {
 | 
			
		||||
    __m256 t;
 | 
			
		||||
    a = _mm512_add_ps(a, _mm512_shuffle_f32x4(a, a, 0xee));
 | 
			
		||||
    t = _mm256_load_ps(m);
 | 
			
		||||
 
 | 
			
		||||
@@ -131,19 +131,19 @@ void readParameter(Parameter *param, const char *filename) {
 | 
			
		||||
void printParameter(Parameter *param) {
 | 
			
		||||
    printf("Parameters:\n");
 | 
			
		||||
    if(param->input_file != NULL) {
 | 
			
		||||
        printf("\tInput file: %s\n", param->input_file);
 | 
			
		||||
        printf("Input file: %s\n", param->input_file);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if(param->vtk_file != NULL) {
 | 
			
		||||
        printf("\tVTK file: %s\n", param->vtk_file);
 | 
			
		||||
        printf("VTK file: %s\n", param->vtk_file);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if(param->xtc_file != NULL) {
 | 
			
		||||
        printf("\tXTC file: %s\n", param->xtc_file);
 | 
			
		||||
        printf("XTC file: %s\n", param->xtc_file);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if(param->eam_file != NULL) {
 | 
			
		||||
        printf("\tEAM file: %s\n", param->eam_file);
 | 
			
		||||
        printf("EAM file: %s\n", param->eam_file);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    printf("\tForce field: %s\n", ff2str(param->force_field));
 | 
			
		||||
 
 | 
			
		||||
@@ -1,5 +1,5 @@
 | 
			
		||||
# Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
 | 
			
		||||
TAG ?= ICC
 | 
			
		||||
TAG ?= NVCC
 | 
			
		||||
# Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512)
 | 
			
		||||
ISA ?= AVX512
 | 
			
		||||
# Optimization scheme (lammps/gromacs/clusters_per_bin)
 | 
			
		||||
@@ -13,7 +13,7 @@ DATA_LAYOUT ?= AOS
 | 
			
		||||
# Assembly syntax to generate (ATT/INTEL)
 | 
			
		||||
ASM_SYNTAX ?= ATT
 | 
			
		||||
# Debug
 | 
			
		||||
DEBUG ?= false
 | 
			
		||||
DEBUG ?= true
 | 
			
		||||
 | 
			
		||||
# Explicitly store and load atom types (true or false)
 | 
			
		||||
EXPLICIT_TYPES ?= false
 | 
			
		||||
@@ -41,6 +41,7 @@ HALF_NEIGHBOR_LISTS_CHECK_CJ ?= false
 | 
			
		||||
# Configurations for CUDA
 | 
			
		||||
# Use CUDA host memory to optimize transfers
 | 
			
		||||
USE_CUDA_HOST_MEMORY ?= false
 | 
			
		||||
USE_SUPER_CLUSTERS ?= true
 | 
			
		||||
 | 
			
		||||
#Feature options
 | 
			
		||||
OPTIONS =  -DALIGNMENT=64
 | 
			
		||||
 
 | 
			
		||||
@@ -7,6 +7,6 @@ temp 80
 | 
			
		||||
x_out_freq 500
 | 
			
		||||
v_out_freq 5
 | 
			
		||||
cutforce 0.9
 | 
			
		||||
skin 0.05
 | 
			
		||||
skin 0.0
 | 
			
		||||
reneigh_every 100
 | 
			
		||||
nstat 125000
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										142
									
								
								gromacs/atom.c
									
									
									
									
									
								
							
							
						
						
									
										142
									
								
								gromacs/atom.c
									
									
									
									
									
								
							@@ -37,7 +37,24 @@ void initAtom(Atom *atom) {
 | 
			
		||||
    atom->iclusters = NULL;
 | 
			
		||||
    atom->jclusters = NULL;
 | 
			
		||||
    atom->icluster_bin = NULL;
 | 
			
		||||
    initMasks(atom);
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
    atom->scl_x = NULL;
 | 
			
		||||
    atom->scl_v = NULL;
 | 
			
		||||
    atom->scl_f = NULL;
 | 
			
		||||
 | 
			
		||||
    atom->Nsclusters = 0;
 | 
			
		||||
    atom->Nsclusters_local = 0;
 | 
			
		||||
    atom->Nsclusters_ghost = 0;
 | 
			
		||||
    atom->Nsclusters_max = 0;
 | 
			
		||||
 | 
			
		||||
    atom->scl_type = NULL;
 | 
			
		||||
 | 
			
		||||
    atom->siclusters = NULL;
 | 
			
		||||
    atom->icluster_idx = NULL;
 | 
			
		||||
 | 
			
		||||
    atom->sicluster_bin = NULL;
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void createAtom(Atom *atom, Parameter *param) {
 | 
			
		||||
@@ -51,7 +68,6 @@ void createAtom(Atom *atom, Parameter *param) {
 | 
			
		||||
    atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
 | 
			
		||||
    atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
 | 
			
		||||
    atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
 | 
			
		||||
 | 
			
		||||
    for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
 | 
			
		||||
        atom->epsilon[i] = param->epsilon;
 | 
			
		||||
        atom->sigma6[i] = param->sigma6;
 | 
			
		||||
@@ -394,113 +410,6 @@ int readAtom_dmp(Atom* atom, Parameter* param) {
 | 
			
		||||
    return natoms;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 * Allocate and fill the lookup tables stored on `atom` that the cluster
 * kernels read: the exclusion filter, the two "j minus i" diagonal tables and
 * the 2xnn / 4xn mask tables (both the `_hn` and `_fn` variants).
 *
 * atom: structure whose exclusion_filter, diagonal_4xn_j_minus_i and
 *       diagonal_2xnn_j_minus_i pointers are (re)assigned with fresh
 *       allocations, and whose masks_2xnn_{hn,fn} / masks_4xn_{hn,fn} arrays
 *       are written in place.
 *
 * The layout of the mask tables depends on the compile-time relation between
 * CLUSTER_M and CLUSTER_N, hence the preprocessor branches below.
 */
void initMasks(Atom *atom) {
    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
    unsigned int mask0, mask1, mask2, mask3;

    atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT));
    atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_UINT));
    atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_UINT));

    // 4xn diagonal table: entry j holds j - 0.5.
    for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {
        atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
    }

    // 2xnn diagonal table: the upper half repeats the lower half shifted by one.
    for(int j = 0; j < VECTOR_WIDTH / 2; j++) {
        atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5;
        atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5;
    }

    // One bit per (i, j) slot.
    // NOTE(review): `1U` is an unsigned int; if CLUSTER_M * VECTOR_WIDTH can
    // exceed its bit width this shift is undefined behavior -- confirm the
    // supported configurations.
    for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) {
        atom->exclusion_filter[i] = (1U << i);
    }

    #if CLUSTER_M == CLUSTER_N
    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
        mask0 = (unsigned int)(0xf - 0x1 * cond0);
        mask1 = (unsigned int)(0xf - 0x3 * cond0);
        mask2 = (unsigned int)(0xf - 0x7 * cond0);
        mask3 = (unsigned int)(0xf - 0xf * cond0);
        atom->masks_2xnn_hn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
        atom->masks_2xnn_hn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

        mask0 = (unsigned int)(0xf - 0x1 * cond0);
        mask1 = (unsigned int)(0xf - 0x2 * cond0);
        mask2 = (unsigned int)(0xf - 0x4 * cond0);
        mask3 = (unsigned int)(0xf - 0x8 * cond0);
        atom->masks_2xnn_fn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
        atom->masks_2xnn_fn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

        atom->masks_4xn_hn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
        atom->masks_4xn_hn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x3 * cond0);
        atom->masks_4xn_hn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x7 * cond0);
        atom->masks_4xn_hn[cond0 * 4 + 3] = (unsigned int)(0xf - 0xf * cond0);

        atom->masks_4xn_fn[cond0 * 4 + 0] = (unsigned int)(0xf - 0x1 * cond0);
        atom->masks_4xn_fn[cond0 * 4 + 1] = (unsigned int)(0xf - 0x2 * cond0);
        atom->masks_4xn_fn[cond0 * 4 + 2] = (unsigned int)(0xf - 0x4 * cond0);
        atom->masks_4xn_fn[cond0 * 4 + 3] = (unsigned int)(0xf - 0x8 * cond0);
    }
    #else
    for(unsigned int cond0 = 0; cond0 < 2; cond0++) {
        for(unsigned int cond1 = 0; cond1 < 2; cond1++) {
            #if CLUSTER_M < CLUSTER_N
            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
            mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
            mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
            mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
            #else
            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
            mask1 = (unsigned int)(0x3 - 0x3 * cond0);
            mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
            mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
            #endif

            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
            atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

            #if CLUSTER_M < CLUSTER_N
            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
            mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
            mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
            mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
            #else
            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
            mask1 = (unsigned int)(0x3 - 0x2 * cond0);
            mask2 = (unsigned int)(0x3 - 0x1 * cond1);
            mask3 = (unsigned int)(0x3 - 0x2 * cond1);
            #endif

            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) | mask0;
            atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2;

            #if CLUSTER_M < CLUSTER_N
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);

            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
            #else
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x3 * cond0);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1);
            atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1);

            // Each of the four values gets its own slot (+0..+3), mirroring
            // the masks_4xn_hn stores above; previously all four were written
            // to slot +0, leaving slots +1..+3 unset.
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0] = (unsigned int)(0x3 - 0x1 * cond0);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1] = (unsigned int)(0x3 - 0x2 * cond0);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2] = (unsigned int)(0x3 - 0x1 * cond1);
            atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3] = (unsigned int)(0x3 - 0x2 * cond1);
            #endif
        }
    }
    #endif
}
 | 
			
		||||
 | 
			
		||||
void growAtom(Atom *atom) {
 | 
			
		||||
    int nold = atom->Nmax;
 | 
			
		||||
    atom->Nmax += DELTA;
 | 
			
		||||
@@ -530,3 +439,18 @@ void growClusters(Atom *atom) {
 | 
			
		||||
    atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
 | 
			
		||||
    atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
/*
 * Enlarge the super-cluster storage of `atom` by DELTA entries.
 *
 * Nsclusters_max is bumped first; every super-cluster array (siclusters,
 * icluster_idx, sicluster_bin, scl_x, scl_f, scl_v) is then passed through
 * reallocate() with its new and previous byte sizes. scl_type is not resized
 * here.
 */
void growSuperClusters(Atom *atom) {
    const int prev_max = atom->Nsclusters_max;

    atom->Nsclusters_max += DELTA;

    // Per-super-cluster bookkeeping arrays.
    atom->siclusters = (SuperCluster*) reallocate(
        atom->siclusters, ALIGNMENT,
        atom->Nsclusters_max * sizeof(SuperCluster),
        prev_max * sizeof(SuperCluster));
    atom->icluster_idx = (int*) reallocate(
        atom->icluster_idx, ALIGNMENT,
        atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int),
        prev_max * SCLUSTER_SIZE * sizeof(int));
    atom->sicluster_bin = (int*) reallocate(
        atom->sicluster_bin, ALIGNMENT,
        atom->Nsclusters_max * sizeof(int),
        prev_max * sizeof(int));

    // Position, force and velocity buffers: SCLUSTER_M * 3 values per super cluster.
    atom->scl_x = (MD_FLOAT*) reallocate(
        atom->scl_x, ALIGNMENT,
        atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT),
        prev_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
    atom->scl_f = (MD_FLOAT*) reallocate(
        atom->scl_f, ALIGNMENT,
        atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT),
        prev_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
    atom->scl_v = (MD_FLOAT*) reallocate(
        atom->scl_v, ALIGNMENT,
        atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT),
        prev_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
}
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
 
 | 
			
		||||
@@ -39,8 +39,29 @@ extern "C" {
 | 
			
		||||
    MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
 | 
			
		||||
    int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
 | 
			
		||||
    int isReneighboured;
 | 
			
		||||
 | 
			
		||||
    int *cuda_iclusters;
 | 
			
		||||
    int *cuda_nclusters;
 | 
			
		||||
 | 
			
		||||
    int cuda_max_scl;
 | 
			
		||||
    MD_FLOAT *cuda_scl_x;
 | 
			
		||||
    MD_FLOAT *cuda_scl_v;
 | 
			
		||||
    MD_FLOAT *cuda_scl_f;
 | 
			
		||||
 | 
			
		||||
    extern void alignDataToSuperclusters(Atom *atom);
 | 
			
		||||
    extern void alignDataFromSuperclusters(Atom *atom);
 | 
			
		||||
    extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
extern __global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
 | 
			
		||||
                                                    int *cuda_nclusters,
 | 
			
		||||
                                                    int *cuda_natoms,
 | 
			
		||||
                                                    int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt);
 | 
			
		||||
 | 
			
		||||
extern __global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
 | 
			
		||||
                                                  int *cuda_nclusters, int *cuda_natoms,
 | 
			
		||||
                                                  int Nsclusters_local, MD_FLOAT dtforce);
 | 
			
		||||
 | 
			
		||||
extern "C"
 | 
			
		||||
void initDevice(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
    cuda_assert("cudaDeviceSetup", cudaDeviceReset());
 | 
			
		||||
@@ -59,10 +80,23 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
    natoms                  =   (int *) malloc(atom->Nclusters_max * sizeof(int));
 | 
			
		||||
    ngatoms                 =   (int *) malloc(atom->Nclusters_max * sizeof(int));
 | 
			
		||||
    isReneighboured = 1;
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
    cuda_max_scl            =   atom->Nsclusters_max;
 | 
			
		||||
    cuda_iclusters          =   (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
 | 
			
		||||
    cuda_nclusters          =   (int *) allocateGPU(atom->Nsclusters_max * sizeof(int));
 | 
			
		||||
 | 
			
		||||
    cuda_scl_x              =   (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
 | 
			
		||||
    cuda_scl_v              =   (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
 | 
			
		||||
    cuda_scl_f              =   (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
 | 
			
		||||
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
extern "C"
 | 
			
		||||
void copyDataToCUDADevice(Atom *atom) {
 | 
			
		||||
    DEBUG_MESSAGE("copyDataToCUDADevice start\r\n");
 | 
			
		||||
 | 
			
		||||
    memcpyToGPU(cuda_cl_x, atom->cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
 | 
			
		||||
    memcpyToGPU(cuda_cl_v, atom->cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
 | 
			
		||||
    memcpyToGPU(cuda_cl_f, atom->cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
 | 
			
		||||
@@ -85,13 +119,49 @@ void copyDataToCUDADevice(Atom *atom) {
 | 
			
		||||
    memcpyToGPU(cuda_PBCx, atom->PBCx, atom->Nclusters_ghost * sizeof(int));
 | 
			
		||||
    memcpyToGPU(cuda_PBCy, atom->PBCy, atom->Nclusters_ghost * sizeof(int));
 | 
			
		||||
    memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int));
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
    //alignDataToSuperclusters(atom);
 | 
			
		||||
 | 
			
		||||
    if (cuda_max_scl < atom->Nsclusters_max) {
 | 
			
		||||
        cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x));
 | 
			
		||||
        cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v));
 | 
			
		||||
        cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f));
 | 
			
		||||
        cuda_max_scl            =   atom->Nsclusters_max;
 | 
			
		||||
 | 
			
		||||
        cuda_iclusters          =   (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
 | 
			
		||||
        cuda_nclusters          =   (int *) allocateGPU(atom->Nsclusters_max * sizeof(int));
 | 
			
		||||
 | 
			
		||||
        cuda_scl_x              =   (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
 | 
			
		||||
        cuda_scl_v              =   (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
 | 
			
		||||
        cuda_scl_f              =   (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
 | 
			
		||||
    }
 | 
			
		||||
    memcpyToGPU(cuda_scl_x, atom->scl_x, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
 | 
			
		||||
    memcpyToGPU(cuda_scl_v, atom->scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
 | 
			
		||||
    memcpyToGPU(cuda_scl_f, atom->scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
 | 
			
		||||
    DEBUG_MESSAGE("copyDataToCUDADevice stop\r\n");
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
extern "C"
/*
 * Copy the cluster position, velocity and force buffers from the device
 * globals (cuda_cl_*) into the matching host arrays of `atom`. When
 * USE_SUPER_CLUSTERS is defined the super-cluster buffers (cuda_scl_*) are
 * copied back as well. Every transfer covers the full *_max capacity.
 */
void copyDataFromCUDADevice(Atom *atom) {
    DEBUG_MESSAGE("copyDataFromCUDADevice start\r\n");

    const size_t cl_bytes = atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT);
    memcpyFromGPU(atom->cl_x, cuda_cl_x, cl_bytes);
    memcpyFromGPU(atom->cl_v, cuda_cl_v, cl_bytes);
    memcpyFromGPU(atom->cl_f, cuda_cl_f, cl_bytes);

#ifdef USE_SUPER_CLUSTERS
    const size_t scl_bytes = atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT);
    memcpyFromGPU(atom->scl_x, cuda_scl_x, scl_bytes);
    memcpyFromGPU(atom->scl_v, cuda_scl_v, scl_bytes);
    memcpyFromGPU(atom->scl_f, cuda_scl_f, scl_bytes);

    // The alignDataFromSuperclusters(atom) call is currently disabled here.
#endif //USE_SUPER_CLUSTERS

    DEBUG_MESSAGE("copyDataFromCUDADevice stop\r\n");
}
 | 
			
		||||
 | 
			
		||||
extern "C"
 | 
			
		||||
@@ -109,6 +179,12 @@ void cudaDeviceFree() {
 | 
			
		||||
    cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCz));
 | 
			
		||||
    free(natoms);
 | 
			
		||||
    free(ngatoms);
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
    cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x));
 | 
			
		||||
    cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v));
 | 
			
		||||
    cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f));
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
 | 
			
		||||
@@ -165,6 +241,39 @@ __global__ void cudaUpdatePbc_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 * Kernel: one thread per ghost cluster. Thread `cg` rewrites the coordinates
 * of cluster (Nsclusters_local / jfac + cg) from the cluster named by
 * cuda_border_map[cg], shifted by the per-ghost PBC multipliers times the
 * box extents. Threads with cg >= Nclusters_ghost do nothing.
 *
 * cuda_cl_x              coordinate buffer, read and written
 * cuda_border_map        source cluster index for each ghost cluster
 * cuda_jclusters_natoms  number of entries to copy for each ghost cluster
 * cuda_PBCx/y/z          integer multipliers applied to the box extents
 * param_xprd/yprd/zprd   box extents
 */
__global__ void cudaUpdatePbcSup_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
                                      int *cuda_jclusters_natoms,
                                      int *cuda_PBCx,
                                      int *cuda_PBCy,
                                      int *cuda_PBCz,
                                      int Nsclusters_local,
                                      int Nclusters_ghost,
                                      MD_FLOAT param_xprd,
                                      MD_FLOAT param_yprd,
                                      MD_FLOAT param_zprd) {
    unsigned int cg = blockDim.x * blockIdx.x + threadIdx.x;
    if (cg >= Nclusters_ghost) {
        return;
    }

    // Ghost clusters are stored after the first Nsclusters_local / jfac clusters.
    int jfac = SCLUSTER_SIZE / CLUSTER_M;
    int first_ghost = Nsclusters_local / jfac;
    const int cj = first_ghost + cg;

    MD_FLOAT *dst = &cuda_cl_x[CJ_VECTOR_BASE_INDEX(cj)];
    MD_FLOAT *src = &cuda_cl_x[CJ_VECTOR_BASE_INDEX(cuda_border_map[cg])];

    int cjj = 0;
    while (cjj < cuda_jclusters_natoms[cg]) {
        dst[CL_X_OFFSET + cjj] = src[CL_X_OFFSET + cjj] + cuda_PBCx[cg] * param_xprd;
        dst[CL_Y_OFFSET + cjj] = src[CL_Y_OFFSET + cjj] + cuda_PBCy[cg] * param_yprd;
        dst[CL_Z_OFFSET + cjj] = src[CL_Z_OFFSET + cjj] + cuda_PBCz[cg] * param_zprd;
        cjj++;
    }
}
 | 
			
		||||
 | 
			
		||||
__global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
 | 
			
		||||
                                         int Nclusters_local, int Nclusters_max,
 | 
			
		||||
                                         int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
 | 
			
		||||
@@ -251,9 +360,17 @@ extern "C"
 | 
			
		||||
/*
 * Host wrapper for the initial-integration kernel. Launches
 * cudaInitialIntegrateSup_warp on the super-cluster buffers when
 * USE_SUPER_CLUSTERS is defined, otherwise cudaInitialIntegrate_warp on the
 * regular cluster buffers, then checks for launch errors and waits for the
 * device to finish.
 */
void cudaInitialIntegrate(Parameter *param, Atom *atom) {
    const int threads_per_block = 16;
    dim3 block_dim(threads_per_block, 1, 1);

#ifdef USE_SUPER_CLUSTERS
    dim3 grid_dim(atom->Nsclusters_local / threads_per_block + 1, 1, 1);
    cudaInitialIntegrateSup_warp<<<grid_dim, block_dim>>>(
        cuda_scl_x, cuda_scl_v, cuda_scl_f,
        cuda_nclusters,
        cuda_natoms, atom->Nsclusters_local, param->dtforce, param->dt);
#else
    dim3 grid_dim(atom->Nclusters_local / threads_per_block + 1, 1, 1);
    cudaInitialIntegrate_warp<<<grid_dim, block_dim>>>(
        cuda_cl_x, cuda_cl_v, cuda_cl_f,
        cuda_natoms, atom->Nclusters_local, param->dtforce, param->dt);
#endif //USE_SUPER_CLUSTERS

    cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
    cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
}
 | 
			
		||||
@@ -264,11 +381,19 @@ extern "C"
 | 
			
		||||
/*
 * Host-side launcher that refreshes ghost-cluster positions on the GPU.
 *
 * Launches the super-cluster or the plain PBC kernel (selected by
 * USE_SUPER_CLUSTERS) with enough threads to cover atom->Nclusters_ghost,
 * then checks for launch errors and waits for completion.
 *
 * atom:  supplies the local and ghost cluster counts.
 * param: supplies the box extents xprd / yprd / zprd.
 */
void cudaUpdatePbc(Atom *atom, Parameter *param) {
    const int threads_num = 512;
    dim3 block_size = dim3(threads_num, 1, 1);
    // Declared exactly once; "+1" guarantees at least one block.
    dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);

#ifdef USE_SUPER_CLUSTERS
    cudaUpdatePbcSup_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_border_map,
                                                     cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
                                                     atom->Nclusters_local, atom->Nclusters_ghost,
                                                     param->xprd, param->yprd, param->zprd);
#else
    cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
                                                  cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
                                                  atom->Nclusters_local, atom->Nclusters_ghost,
                                                  param->xprd, param->yprd, param->zprd);
#endif //USE_SUPER_CLUSTERS

    // First call reports launch-time errors, second one execution errors.
    cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
    cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
}
 | 
			
		||||
@@ -310,8 +435,17 @@ extern "C"
 | 
			
		||||
/*
 * Host-side launcher for the final velocity update on the GPU.
 *
 * Launches the super-cluster or the plain kernel (selected by
 * USE_SUPER_CLUSTERS) exactly once, then checks for launch errors and waits
 * for completion.
 *
 * param: supplies the time-step factor forwarded to the kernel.
 * atom:  supplies the number of local (super-)clusters.
 */
void cudaFinalIntegrate(Parameter *param, Atom *atom) {
    const int threads_num = 16;
    dim3 block_size = dim3(threads_num, 1, 1);

    #ifdef USE_SUPER_CLUSTERS
    dim3 grid_size = dim3(atom->Nsclusters_local/(threads_num)+1, 1, 1);
    // The kernel's last parameter is `dtforce` and it computes
    // v += dtforce * f, i.e. the same velocity kick that
    // cudaInitialIntegrate feeds with param->dtforce.
    cudaFinalIntegrateSup_warp<<<grid_size, block_size>>>(cuda_scl_v, cuda_scl_f,
                                                          cuda_nclusters, cuda_natoms,
                                                          atom->Nsclusters_local, param->dtforce);
    #else
    dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
    // Single launch only: launching twice would apply the kick twice.
    // NOTE(review): this path still passes param->dt; if cudaFinalIntegrate_warp
    // also expects dtforce (its definition is not visible here), switch it too.
    cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms,
                                                       atom->Nclusters_local, param->dt);
    #endif //USE_SUPER_CLUSTERS

    // First call reports launch-time errors, second one execution errors.
    cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
    cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										288
									
								
								gromacs/cuda/force_lj_sup.cu
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										288
									
								
								gromacs/cuda/force_lj_sup.cu
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,288 @@
 | 
			
		||||
 | 
			
		||||
extern "C" {
 | 
			
		||||
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
//---
 | 
			
		||||
#include <cuda.h>
 | 
			
		||||
#include <driver_types.h>
 | 
			
		||||
//---
 | 
			
		||||
#include <likwid-marker.h>
 | 
			
		||||
//---
 | 
			
		||||
#include <atom.h>
 | 
			
		||||
#include <device.h>
 | 
			
		||||
#include <neighbor.h>
 | 
			
		||||
#include <parameter.h>
 | 
			
		||||
#include <stats.h>
 | 
			
		||||
#include <timing.h>
 | 
			
		||||
#include <util.h>
 | 
			
		||||
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// Buffers and flags that are defined in another translation unit and shared
// with the super-cluster kernels in this file.
extern "C" {
    // Per-cluster position / velocity / force arrays.
    extern MD_FLOAT *cuda_cl_x, *cuda_cl_v, *cuda_cl_f;

    // Neighbor list storage and per-cluster neighbor counts.
    extern int *cuda_neighbors;
    extern int *cuda_numneigh;

    // Atom counts.
    extern int *cuda_natoms;
    extern int *natoms;
    extern int *ngatoms;

    // Ghost handling: border map, j-cluster atom counts and PBC multipliers.
    extern int *cuda_border_map;
    extern int *cuda_jclusters_natoms;
    extern int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;

    // Bounding boxes (min / max per dimension).
    extern MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
    extern MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
    extern MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;

    // Non-zero when neighbor data must be re-uploaded before the next force call.
    extern int isReneighboured;

    // Super-cluster bookkeeping: member cluster indices and cluster counts.
    extern int *cuda_iclusters;
    extern int *cuda_nclusters;

    // Per-super-cluster position / velocity / force arrays.
    extern MD_FLOAT *cuda_scl_x, *cuda_scl_v, *cuda_scl_f;
}
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
/*
 * Copies positions, velocities and forces from the per-cluster host arrays
 * (cl_x / cl_v / cl_f) into the super-cluster host arrays
 * (scl_x / scl_v / scl_f).
 *
 * Inside a super-cluster each dimension occupies SCLUSTER_SIZE * CLUSTER_M
 * consecutive values, and member cluster `ci` owns CLUSTER_M of them starting
 * at ci * CLUSTER_M. In the source cluster the three dimensions are CLUSTER_M
 * values apart.
 */
extern "C"
void alignDataToSuperclusters(Atom *atom) {
    for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
        const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
        int dst_base = scl_offset;

        for (int ci = 0; ci < atom->siclusters[sci].nclusters; ci++) {
            const int src_base = CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci]);

            // dim = 0, 1, 2 selects the x, y, z slice of both layouts.
            for (int dim = 0; dim < 3; dim++) {
                const int dst = dst_base + dim * SCLUSTER_SIZE * CLUSTER_M;
                const int src = src_base + dim * CLUSTER_M;

                memcpy(&atom->scl_x[dst], &atom->cl_x[src], CLUSTER_M * sizeof(MD_FLOAT));
                memcpy(&atom->scl_v[dst], &atom->cl_v[src], CLUSTER_M * sizeof(MD_FLOAT));
                memcpy(&atom->scl_f[dst], &atom->cl_f[src], CLUSTER_M * sizeof(MD_FLOAT));
            }

            dst_base += CLUSTER_M;
        }
    }
}
 | 
			
		||||
 | 
			
		||||
/*
 * Inverse of alignDataToSuperclusters: copies positions, velocities and
 * forces from the super-cluster host arrays (scl_x / scl_v / scl_f) back into
 * the per-cluster host arrays (cl_x / cl_v / cl_f).
 *
 * Inside a super-cluster each dimension occupies SCLUSTER_SIZE * CLUSTER_M
 * consecutive values, and member cluster `ci` owns CLUSTER_M of them starting
 * at ci * CLUSTER_M. In the destination cluster the three dimensions are
 * CLUSTER_M values apart.
 */
extern "C"
void alignDataFromSuperclusters(Atom *atom) {
    for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
        const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
        int src_base = scl_offset;

        for (int ci = 0; ci < atom->siclusters[sci].nclusters; ci++) {
            const int dst_base = CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci]);

            // dim = 0, 1, 2 selects the x, y, z slice of both layouts.
            for (int dim = 0; dim < 3; dim++) {
                const int src = src_base + dim * SCLUSTER_SIZE * CLUSTER_M;
                const int dst = dst_base + dim * CLUSTER_M;

                memcpy(&atom->cl_x[dst], &atom->scl_x[src], CLUSTER_M * sizeof(MD_FLOAT));
                memcpy(&atom->cl_v[dst], &atom->scl_v[src], CLUSTER_M * sizeof(MD_FLOAT));
                memcpy(&atom->cl_f[dst], &atom->scl_f[src], CLUSTER_M * sizeof(MD_FLOAT));
            }

            src_base += CLUSTER_M;
        }
    }
}
 | 
			
		||||
 | 
			
		||||
/*
 * Kernel: first integration step for super-clusters.
 *
 * Each thread handles one super-cluster (index taken from the x dimension of
 * the launch grid) and, for all SCLUSTER_M slots of it, performs
 *     v += dtforce * f;   x += dt * v;
 * in all three dimensions.
 *
 * cuda_nclusters and cuda_natoms are accepted but not read, so every slot is
 * updated regardless of how many clusters or atoms are actually present.
 */
__global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
                                             int *cuda_nclusters,
                                             int *cuda_natoms,
                                             int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {

    const unsigned int sci = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads of the surplus block(s) have nothing to do.
    if (sci < Nsclusters_local) {
        const int base = SCI_VECTOR_BASE_INDEX(sci);
        MD_FLOAT *pos = &cuda_cl_x[base];
        MD_FLOAT *vel = &cuda_cl_v[base];
        MD_FLOAT *frc = &cuda_cl_f[base];

        for (int slot = 0; slot < SCLUSTER_M; slot++) {
            const int ix = SCL_X_OFFSET + slot;
            const int iy = SCL_Y_OFFSET + slot;
            const int iz = SCL_Z_OFFSET + slot;

            // Velocity kick first ...
            vel[ix] += dtforce * frc[ix];
            vel[iy] += dtforce * frc[iy];
            vel[iz] += dtforce * frc[iz];

            // ... then drift using the freshly updated velocity.
            pos[ix] += dt * vel[ix];
            pos[iy] += dt * vel[iy];
            pos[iz] += dt * vel[iz];
        }
    }
}
 | 
			
		||||
 | 
			
		||||
/*
 * Kernel: final velocity update for super-clusters.
 *
 * Each thread handles one super-cluster (index taken from the x dimension of
 * the launch grid) and, for all SCLUSTER_M slots of it, performs
 *     v += dtforce * f;
 * in all three dimensions.
 *
 * cuda_nclusters and cuda_natoms are accepted but not read, so every slot is
 * updated regardless of how many clusters or atoms are actually present.
 */
__global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
                                           int *cuda_nclusters, int *cuda_natoms,
                                           int Nsclusters_local, MD_FLOAT dtforce) {

    const unsigned int sci = blockDim.x * blockIdx.x + threadIdx.x;

    // Threads of the surplus block(s) have nothing to do.
    if (sci < Nsclusters_local) {
        const int base = SCI_VECTOR_BASE_INDEX(sci);
        MD_FLOAT *vel = &cuda_cl_v[base];
        MD_FLOAT *frc = &cuda_cl_f[base];

        for (int slot = 0; slot < SCLUSTER_M; slot++) {
            const int ix = SCL_X_OFFSET + slot;
            const int iy = SCL_Y_OFFSET + slot;
            const int iz = SCL_Z_OFFSET + slot;

            vel[ix] += dtforce * frc[ix];
            vel[iy] += dtforce * frc[iy];
            vel[iz] += dtforce * frc[iz];
        }
    }
}
 | 
			
		||||
 | 
			
		||||
/*
 * Kernel: Lennard-Jones forces on the super-cluster layout.
 *
 * Thread mapping (from the launch grid):
 *   x -> super-cluster index            (sci_pos)
 *   y -> atom slot inside super-cluster (scii_pos), split into the member
 *        cluster (ci_pos) and the atom inside that cluster (cii_pos)
 *   z -> atom inside the neighbor cluster (cjj_pos)
 *
 * For every neighbor cluster of the i-cluster the thread evaluates a single
 * i/j atom pair and accumulates the result with atomicAdd, because several
 * threads contribute to the same force entry.
 *
 * cuda_cl_x / cuda_cl_f: position / force arrays in super-cluster layout.
 * cuda_nclusters:        number of member clusters per super-cluster.
 * cuda_iclusters:        global cluster index of every super-cluster member.
 * cuda_numneigh / cuda_neighs / maxneighs: neighbor list, indexed by the
 *                        global cluster index.
 * half_neigh:            non-zero if each pair is listed once, in which case
 *                        the reaction force is applied to the j atom too.
 * cutforcesq, sigma6, epsilon: Lennard-Jones parameters.
 */
__global__ void computeForceLJSup_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
                                            int *cuda_nclusters, int *cuda_iclusters,
                                            int Nsclusters_local,
                                            int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
                                            MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {

    unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
    unsigned int scii_pos = blockDim.y * blockIdx.y + threadIdx.y;
    unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;
    if ((sci_pos >= Nsclusters_local) || (scii_pos >= SCLUSTER_M) || (cjj_pos >= CLUSTER_N)) return;

    unsigned int ci_pos = scii_pos / CLUSTER_M;
    unsigned int cii_pos = scii_pos % CLUSTER_M;

    // Super-clusters may be only partially filled.
    if (ci_pos >= cuda_nclusters[sci_pos]) return;

    int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
    MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
    MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];

    // Everything that depends only on the i atom is loop invariant.
    const int glob_i = cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos];
    const int numneighs = cuda_numneigh[glob_i];
    const int *neighs = &cuda_neighs[glob_i * maxneighs];

    const MD_FLOAT xtmp = ci_x[SCL_CL_X_OFFSET(ci_pos) + cii_pos];
    const MD_FLOAT ytmp = ci_x[SCL_CL_Y_OFFSET(ci_pos) + cii_pos];
    const MD_FLOAT ztmp = ci_x[SCL_CL_Z_OFFSET(ci_pos) + cii_pos];

    for(int k = 0; k < numneighs; k++) {
        int glob_j = neighs[k];
        int scj = glob_j / SCLUSTER_SIZE;
        // TODO Make cj accessible from super cluster data alignment (not reachable right now)
        // NOTE(review): glob_j is compared against a global cluster index below
        // but is decomposed here as if it were a super-cluster slot index, and
        // the j accesses add the *i* cluster's SCL_CL_*_OFFSET(ci_pos) on top of
        // a base that already selects the j cluster. Both look inconsistent, but
        // the macro definitions are not visible here - confirm before relying
        // on the j-side indexing.
        int cj_vec_base = SCJ_VECTOR_BASE_INDEX(scj) + CLUSTER_M * (glob_j % SCLUSTER_SIZE);
        MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
        MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];

        // Pair exclusion only matters inside the i cluster itself:
        //  - full list: skip just the atom paired with itself;
        //  - half list: additionally keep only cii < cjj so that each pair is
        //    evaluated once (the reaction force is applied below).
        // Pairs in different clusters always interact, also when the two
        // in-cluster indices happen to be equal.
        const int same_cluster = (glob_j == glob_i);
        const int excluded = same_cluster &&
                             (half_neigh ? (cii_pos >= cjj_pos) : (cii_pos == cjj_pos));
        if(excluded) continue;

        MD_FLOAT delx = xtmp - cj_x[SCL_CL_X_OFFSET(ci_pos) + cjj_pos];
        MD_FLOAT dely = ytmp - cj_x[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos];
        MD_FLOAT delz = ztmp - cj_x[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos];
        MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;

        if(rsq < cutforcesq) {
            MD_FLOAT sr2 = 1.0 / rsq;
            MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
            MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;

            if(half_neigh) {
                // Newton's third law: equal and opposite force on the j atom.
                atomicAdd(&cj_f[SCL_CL_X_OFFSET(ci_pos) + cjj_pos], -delx * force);
                atomicAdd(&cj_f[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos], -dely * force);
                atomicAdd(&cj_f[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos], -delz * force);
            }

            atomicAdd(&ci_f[SCL_CL_X_OFFSET(ci_pos) + cii_pos], delx * force);
            atomicAdd(&ci_f[SCL_CL_Y_OFFSET(ci_pos) + cii_pos], dely * force);
            atomicAdd(&ci_f[SCL_CL_Z_OFFSET(ci_pos) + cii_pos], delz * force);
        }
    }
}
 | 
			
		||||
 | 
			
		||||
/*
 * Host-side driver for the super-cluster Lennard-Jones force kernel.
 *
 * Steps:
 *   1. zero the device force buffers;
 *   2. if the neighbor lists were rebuilt (isReneighboured), upload neighbor
 *      counts / lists and the super-cluster membership data, then clear the flag;
 *   3. launch computeForceLJSup_cuda_warp with one block per super-cluster
 *      and SCLUSTER_M x CLUSTER_N threads per block;
 *   4. check for errors and wait for completion.
 *
 * param:    supplies cutforce, sigma6 and epsilon.
 * atom:     supplies cluster / super-cluster counts and host-side index data.
 * neighbor: supplies the host neighbor lists, maxneighs and half_neigh.
 * stats:    not used by this function.
 *
 * Returns the time between the two getTimeStamp() calls that bracket the
 * kernel launch and the device synchronization.
 */
extern "C"
double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
    DEBUG_MESSAGE("computeForceLJSup_cuda start\r\n");

    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
    MD_FLOAT sigma6 = param->sigma6;
    MD_FLOAT epsilon = param->epsilon;

    memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
    // The kernel only ever atomicAdd()s into cuda_scl_f, so that buffer has to
    // start from zero as well; otherwise forces pile up across calls.
    // NOTE(review): only the local super-clusters are cleared, using the same
    // per-super-cluster stride as alignDataToSuperclusters; widen this to the
    // allocated size if ghost entries receive half-list contributions.
    memsetGPU(cuda_scl_f, 0, atom->Nsclusters_local * SCLUSTER_SIZE * CLUSTER_M * 3 * sizeof(MD_FLOAT));

    if (isReneighboured) {

        for(int ci = 0; ci < atom->Nclusters_local; ci++) {
            memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int));
            // Only the used part of each neighbor row is transferred.
            memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));
        }

        for(int sci = 0; sci < atom->Nsclusters_local; sci++) {
            memcpyToGPU(&cuda_nclusters[sci], &atom->siclusters[sci].nclusters, sizeof(int));
        }

        memcpyToGPU(cuda_iclusters, atom->icluster_idx, atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));

        isReneighboured = 0;
    }

    const int threads_num = 1;
    dim3 block_size = dim3(threads_num, SCLUSTER_M, CLUSTER_N);
    // "+1" guarantees at least one block; the kernel bounds-checks the surplus.
    dim3 grid_size = dim3(atom->Nsclusters_local/threads_num+1, 1, 1);
    double S = getTimeStamp();
    LIKWID_MARKER_START("force");
    computeForceLJSup_cuda_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_scl_f,
                                                           cuda_nclusters, cuda_iclusters,
                                                           atom->Nsclusters_local,
                                                           cuda_numneigh, cuda_neighbors,
                                                           neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
                                                           sigma6, epsilon);
    // Label the errors with this function's own name so they can be told
    // apart from failures of the non-super-cluster force routine.
    cuda_assert("computeForceLJSup_cuda", cudaPeekAtLastError());
    cuda_assert("computeForceLJSup_cuda", cudaDeviceSynchronize());
    LIKWID_MARKER_STOP("force");
    double E = getTimeStamp();
    DEBUG_MESSAGE("computeForceLJSup_cuda stop\r\n");
    return E-S;
}
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
@@ -16,36 +16,10 @@
 | 
			
		||||
#include <simd.h>
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
static inline void gmx_load_simd_2xnn_interactions(
 | 
			
		||||
    int excl,
 | 
			
		||||
    MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter2,
 | 
			
		||||
    MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact2) {
 | 
			
		||||
 | 
			
		||||
    //SimdInt32 mask_pr_S(excl);
 | 
			
		||||
    MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
 | 
			
		||||
    *interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
 | 
			
		||||
    *interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static inline void gmx_load_simd_4xn_interactions(
 | 
			
		||||
    int excl,
 | 
			
		||||
    MD_SIMD_BITMASK filter0, MD_SIMD_BITMASK filter1, MD_SIMD_BITMASK filter2, MD_SIMD_BITMASK filter3,
 | 
			
		||||
    MD_SIMD_MASK *interact0, MD_SIMD_MASK *interact1, MD_SIMD_MASK *interact2, MD_SIMD_MASK *interact3) {
 | 
			
		||||
 | 
			
		||||
    //SimdInt32 mask_pr_S(excl);
 | 
			
		||||
    MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl);
 | 
			
		||||
    *interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0));
 | 
			
		||||
    *interact1 = cvtIB2B(simd_test_bits(mask_pr_S & filter1));
 | 
			
		||||
    *interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2));
 | 
			
		||||
    *interact3 = cvtIB2B(simd_test_bits(mask_pr_S & filter3));
 | 
			
		||||
}
 | 
			
		||||
*/
 | 
			
		||||
 | 
			
		||||
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
 | 
			
		||||
    DEBUG_MESSAGE("computeForceLJ begin\n");
 | 
			
		||||
    int Nlocal = atom->Nlocal;
 | 
			
		||||
    NeighborCluster* neighs;
 | 
			
		||||
    int* neighs;
 | 
			
		||||
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
 | 
			
		||||
    MD_FLOAT sigma6 = param->sigma6;
 | 
			
		||||
    MD_FLOAT epsilon = param->epsilon;
 | 
			
		||||
@@ -61,12 +35,9 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    double S = getTimeStamp();
 | 
			
		||||
 | 
			
		||||
    #pragma omp parallel
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp parallel for
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        int ci_cj0 = CJ0_FROM_CI(ci);
 | 
			
		||||
        int ci_cj1 = CJ1_FROM_CI(ci);
 | 
			
		||||
@@ -77,7 +48,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 | 
			
		||||
        int numneighs = neighbor->numneigh[ci];
 | 
			
		||||
 | 
			
		||||
        for(int k = 0; k < numneighs; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            int any = 0;
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
@@ -148,8 +119,6 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_STOP("force");
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    double E = getTimeStamp();
 | 
			
		||||
    DEBUG_MESSAGE("computeForceLJ end\n");
 | 
			
		||||
    return E-S;
 | 
			
		||||
@@ -158,7 +127,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 | 
			
		||||
double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
 | 
			
		||||
    DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
 | 
			
		||||
    int Nlocal = atom->Nlocal;
 | 
			
		||||
    NeighborCluster* neighs;
 | 
			
		||||
    int* neighs;
 | 
			
		||||
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
 | 
			
		||||
    MD_FLOAT sigma6 = param->sigma6;
 | 
			
		||||
    MD_FLOAT epsilon = param->epsilon;
 | 
			
		||||
@@ -167,6 +136,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
 | 
			
		||||
    MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
 | 
			
		||||
    MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
 | 
			
		||||
    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
 | 
			
		||||
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
 | 
			
		||||
@@ -179,41 +149,9 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    double S = getTimeStamp();
 | 
			
		||||
 | 
			
		||||
    #pragma omp parallel
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    /*
 | 
			
		||||
    MD_SIMD_BITMASK filter0 = simd_load_bitmask((const int *) &atom->exclusion_filter[0 * (VECTOR_WIDTH / UNROLL_J)]);
 | 
			
		||||
    MD_SIMD_BITMASK filter2 = simd_load_bitmask((const int *) &atom->exclusion_filter[2 * (VECTOR_WIDTH / UNROLL_J)]);
 | 
			
		||||
 | 
			
		||||
    MD_SIMD_FLOAT diagonal_jmi_S = simd_load(atom->diagonal_2xnn_j_minus_i);
 | 
			
		||||
    MD_SIMD_FLOAT zero_S = simd_broadcast(0.0);
 | 
			
		||||
    MD_SIMD_FLOAT one_S = simd_broadcast(1.0);
 | 
			
		||||
 | 
			
		||||
    #if CLUSTER_M <= CLUSTER_N
 | 
			
		||||
    MD_SIMD_MASK diagonal_mask0, diagonal_mask2;
 | 
			
		||||
    diagonal_mask0 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
 | 
			
		||||
    diagonal_jmi_S = diagonal_jmi_S - one_S;
 | 
			
		||||
    diagonal_jmi_S = diagonal_jmi_S - one_S;
 | 
			
		||||
    diagonal_mask2 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
 | 
			
		||||
    #else
 | 
			
		||||
    MD_SIMD_MASK diagonal_mask00, diagonal_mask02, diagonal_mask10, diagonal_mask12;
 | 
			
		||||
    diagonal_mask00 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
 | 
			
		||||
    diagonal_jmi_S = diagonal_jmi_S - one_S;
 | 
			
		||||
    diagonal_jmi_S = diagonal_jmi_S - one_S;
 | 
			
		||||
    diagonal_mask02 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
 | 
			
		||||
    diagonal_jmi_S = diagonal_jmi_S - one_S;
 | 
			
		||||
    diagonal_jmi_S = diagonal_jmi_S - one_S;
 | 
			
		||||
    diagonal_mask10 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
 | 
			
		||||
    diagonal_jmi_S = diagonal_jmi_S - one_S;
 | 
			
		||||
    diagonal_jmi_S = diagonal_jmi_S - one_S;
 | 
			
		||||
    diagonal_mask12 = simd_mask_cond_lt(zero_S, diagonal_jmi_S);
 | 
			
		||||
    #endif
 | 
			
		||||
    */
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp parallel for
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        int ci_cj0 = CJ0_FROM_CI(ci);
 | 
			
		||||
        #if CLUSTER_M > CLUSTER_N
 | 
			
		||||
@@ -224,7 +162,6 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
 | 
			
		||||
        neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
 | 
			
		||||
        int numneighs = neighbor->numneigh[ci];
 | 
			
		||||
        int numneighs_masked = neighbor->numneigh_masked[ci];
 | 
			
		||||
 | 
			
		||||
        MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
 | 
			
		||||
        MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
 | 
			
		||||
@@ -239,138 +176,76 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
        MD_SIMD_FLOAT fiy2 = simd_zero();
 | 
			
		||||
        MD_SIMD_FLOAT fiz2 = simd_zero();
 | 
			
		||||
 | 
			
		||||
        for(int k = 0; k < numneighs_masked; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
        for(int k = 0; k < numneighs; k++) {
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            //int imask = neighs[k].imask;
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
            MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
 | 
			
		||||
            //MD_SIMD_MASK interact0;
 | 
			
		||||
            //MD_SIMD_MASK interact2;
 | 
			
		||||
 | 
			
		||||
            //gmx_load_simd_2xnn_interactions((int)imask, filter0, filter2, &interact0, &interact2);
 | 
			
		||||
            unsigned int mask0, mask1, mask2, mask3;
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
 | 
			
		||||
            MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
 | 
			
		||||
            MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
 | 
			
		||||
 | 
			
		||||
            #if CLUSTER_M == CLUSTER_N
 | 
			
		||||
            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 2 + 0]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 2 + 1]);
 | 
			
		||||
            #else
 | 
			
		||||
            #if CLUSTER_M < CLUSTER_N
 | 
			
		||||
            mask0 = (unsigned int)(0xf - 0x1 * cond0);
 | 
			
		||||
            mask1 = (unsigned int)(0xf - 0x3 * cond0);
 | 
			
		||||
            mask2 = (unsigned int)(0xf - 0x7 * cond0);
 | 
			
		||||
            mask3 = (unsigned int)(0xf - 0xf * cond0);
 | 
			
		||||
            #elif CLUSTER_M < CLUSTER_N
 | 
			
		||||
            unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
 | 
			
		||||
            unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
 | 
			
		||||
            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1);
 | 
			
		||||
            mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1);
 | 
			
		||||
            mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1);
 | 
			
		||||
            mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1);
 | 
			
		||||
            #else
 | 
			
		||||
            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
 | 
			
		||||
            unsigned int cond1 = (unsigned int)(cj == ci_cj1);
 | 
			
		||||
            #endif
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 0]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_hn[cond0 * 4 + cond1 * 2 + 1]);
 | 
			
		||||
            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
 | 
			
		||||
            mask1 = (unsigned int)(0x3 - 0x3 * cond0);
 | 
			
		||||
            mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1);
 | 
			
		||||
            mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1);
 | 
			
		||||
            #endif
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
 | 
			
		||||
            cutoff_mask0 = simd_mask_and(cutoff_mask0, excl_mask0);
 | 
			
		||||
            cutoff_mask2 = simd_mask_and(cutoff_mask2, excl_mask2);
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
 | 
			
		||||
 | 
			
		||||
            /*
 | 
			
		||||
            #if CLUSTER_M <= CLUSTER_N
 | 
			
		||||
            if(ci == ci_cj0) {
 | 
			
		||||
                cutoff_mask0 = simd_mask_and(cutoff_mask0, diagonal_mask0);
 | 
			
		||||
                cutoff_mask2 = simd_mask_and(cutoff_mask2, diagonal_mask2);
 | 
			
		||||
            }
 | 
			
		||||
            #else
 | 
			
		||||
            if(ci == ci_cj0) {
 | 
			
		||||
                cutoff_mask0 = cutoff_mask0 && diagonal_mask00;
 | 
			
		||||
                cutoff_mask2 = cutoff_mask2 && diagonal_mask02;
 | 
			
		||||
            } else if(ci == ci_cj1) {
 | 
			
		||||
                cutoff_mask0 = cutoff_mask0 && diagonal_mask10;
 | 
			
		||||
                cutoff_mask2 = cutoff_mask2 && diagonal_mask12;
 | 
			
		||||
            }
 | 
			
		||||
            #endif
 | 
			
		||||
            */
 | 
			
		||||
            MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
 | 
			
		||||
            MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
 | 
			
		||||
            MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
 | 
			
		||||
            MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
 | 
			
		||||
            MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
 | 
			
		||||
 | 
			
		||||
            fix0 += tx0;
 | 
			
		||||
            fiy0 += ty0;
 | 
			
		||||
            fiz0 += tz0;
 | 
			
		||||
            fix2 += tx2;
 | 
			
		||||
            fiy2 += ty2;
 | 
			
		||||
            fiz2 += tz2;
 | 
			
		||||
            MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
 | 
			
		||||
            MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
 | 
			
		||||
 | 
			
		||||
            #ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
 | 
			
		||||
            if(cj < CJ1_FROM_CI(atom->Nlocal)) {
 | 
			
		||||
                simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2);
 | 
			
		||||
            }
 | 
			
		||||
            #else
 | 
			
		||||
            simd_h_decr3(cj_f, tx0 + tx2, ty0 + ty2, tz0 + tz2);
 | 
			
		||||
            #endif
 | 
			
		||||
        }
 | 
			
		||||
            MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2);
 | 
			
		||||
 | 
			
		||||
        for(int k = numneighs_masked; k < numneighs; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
            MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
 | 
			
		||||
            MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
 | 
			
		||||
            MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
 | 
			
		||||
            MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
 | 
			
		||||
 | 
			
		||||
            fix0 += tx0;
 | 
			
		||||
            fiy0 += ty0;
 | 
			
		||||
            fiz0 += tz0;
 | 
			
		||||
            fix2 += tx2;
 | 
			
		||||
            fiy2 += ty2;
 | 
			
		||||
            fiz2 += tz2;
 | 
			
		||||
            fix0 = simd_add(fix0, tx0);
 | 
			
		||||
            fiy0 = simd_add(fiy0, ty0);
 | 
			
		||||
            fiz0 = simd_add(fiz0, tz0);
 | 
			
		||||
            fix2 = simd_add(fix2, tx2);
 | 
			
		||||
            fiy2 = simd_add(fiy2, ty2);
 | 
			
		||||
            fiz2 = simd_add(fiz2, tz2);
 | 
			
		||||
 | 
			
		||||
            #ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
 | 
			
		||||
            if(cj < CJ1_FROM_CI(atom->Nlocal)) {
 | 
			
		||||
@@ -391,8 +266,6 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_STOP("force");
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    double E = getTimeStamp();
 | 
			
		||||
    DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
 | 
			
		||||
    return E-S;
 | 
			
		||||
@@ -401,7 +274,7 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
 | 
			
		||||
    DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
 | 
			
		||||
    int Nlocal = atom->Nlocal;
 | 
			
		||||
    NeighborCluster* neighs;
 | 
			
		||||
    int* neighs;
 | 
			
		||||
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
 | 
			
		||||
    MD_FLOAT sigma6 = param->sigma6;
 | 
			
		||||
    MD_FLOAT epsilon = param->epsilon;
 | 
			
		||||
@@ -410,6 +283,7 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
 | 
			
		||||
    MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
 | 
			
		||||
    MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
 | 
			
		||||
    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
 | 
			
		||||
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
 | 
			
		||||
@@ -422,12 +296,9 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    double S = getTimeStamp();
 | 
			
		||||
 | 
			
		||||
    #pragma omp parallel
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp parallel for
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        int ci_cj0 = CJ0_FROM_CI(ci);
 | 
			
		||||
        #if CLUSTER_M > CLUSTER_N
 | 
			
		||||
@@ -438,7 +309,6 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
 | 
			
		||||
        neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
 | 
			
		||||
        int numneighs = neighbor->numneigh[ci];
 | 
			
		||||
        int numneighs_masked = neighbor->numneigh_masked[ci];
 | 
			
		||||
 | 
			
		||||
        MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
 | 
			
		||||
        MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
 | 
			
		||||
@@ -453,85 +323,61 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
        MD_SIMD_FLOAT fiy2 = simd_zero();
 | 
			
		||||
        MD_SIMD_FLOAT fiz2 = simd_zero();
 | 
			
		||||
 | 
			
		||||
        for(int k = 0; k < numneighs_masked; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
        for(int k = 0; k < numneighs; k++) {
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            int imask = neighs[k].imask;
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
            unsigned int mask0, mask1, mask2, mask3;
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
 | 
			
		||||
            MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
 | 
			
		||||
            MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
 | 
			
		||||
 | 
			
		||||
            #if CLUSTER_M == CLUSTER_N
 | 
			
		||||
            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 2 + 0]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 2 + 1]);
 | 
			
		||||
            #else
 | 
			
		||||
            #if CLUSTER_M < CLUSTER_N
 | 
			
		||||
            mask0 = (unsigned int)(0xf - 0x1 * cond0);
 | 
			
		||||
            mask1 = (unsigned int)(0xf - 0x2 * cond0);
 | 
			
		||||
            mask2 = (unsigned int)(0xf - 0x4 * cond0);
 | 
			
		||||
            mask3 = (unsigned int)(0xf - 0x8 * cond0);
 | 
			
		||||
            #elif CLUSTER_M < CLUSTER_N
 | 
			
		||||
            unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
 | 
			
		||||
            unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
 | 
			
		||||
            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
 | 
			
		||||
            mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
 | 
			
		||||
            mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
 | 
			
		||||
            mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
 | 
			
		||||
            #else
 | 
			
		||||
            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
 | 
			
		||||
            unsigned int cond1 = (unsigned int)(cj == ci_cj1);
 | 
			
		||||
            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
 | 
			
		||||
            mask1 = (unsigned int)(0x3 - 0x2 * cond0);
 | 
			
		||||
            mask2 = (unsigned int)(0x3 - 0x1 * cond1);
 | 
			
		||||
            mask3 = (unsigned int)(0x3 - 0x2 * cond1);
 | 
			
		||||
            #endif
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 0]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn_fn[cond0 * 4 + cond1 * 2 + 1]);
 | 
			
		||||
            #endif
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
 | 
			
		||||
            MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
 | 
			
		||||
            MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
 | 
			
		||||
            MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
 | 
			
		||||
 | 
			
		||||
            fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
 | 
			
		||||
            fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
 | 
			
		||||
            fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
 | 
			
		||||
            fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
 | 
			
		||||
            fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
 | 
			
		||||
            fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
 | 
			
		||||
        }
 | 
			
		||||
            MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
 | 
			
		||||
            MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
 | 
			
		||||
 | 
			
		||||
        for(int k = numneighs_masked; k < numneighs; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
 | 
			
		||||
            MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
 | 
			
		||||
            MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
 | 
			
		||||
            MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
 | 
			
		||||
            MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
 | 
			
		||||
 | 
			
		||||
            fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
 | 
			
		||||
            fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
 | 
			
		||||
@@ -552,8 +398,6 @@ double computeForceLJ_2xnn_full(Parameter *param, Atom *atom, Neighbor *neighbor
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_STOP("force");
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    double E = getTimeStamp();
 | 
			
		||||
    DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
 | 
			
		||||
    return E-S;
 | 
			
		||||
@@ -570,7 +414,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
 | 
			
		||||
double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
 | 
			
		||||
    DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
 | 
			
		||||
    int Nlocal = atom->Nlocal;
 | 
			
		||||
    NeighborCluster* neighs;
 | 
			
		||||
    int* neighs;
 | 
			
		||||
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
 | 
			
		||||
    MD_FLOAT sigma6 = param->sigma6;
 | 
			
		||||
    MD_FLOAT epsilon = param->epsilon;
 | 
			
		||||
@@ -579,6 +423,8 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
 | 
			
		||||
    MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
 | 
			
		||||
    MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
 | 
			
		||||
    double S = getTimeStamp();
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
 | 
			
		||||
@@ -590,13 +436,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    double S = getTimeStamp();
 | 
			
		||||
 | 
			
		||||
    #pragma omp parallel
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp parallel for
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        int ci_cj0 = CJ0_FROM_CI(ci);
 | 
			
		||||
        #if CLUSTER_M > CLUSTER_N
 | 
			
		||||
@@ -607,7 +447,6 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
 | 
			
		||||
        neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
 | 
			
		||||
        int numneighs = neighbor->numneigh[ci];
 | 
			
		||||
        int numneighs_masked = neighbor->numneigh_masked[ci];
 | 
			
		||||
 | 
			
		||||
        MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
 | 
			
		||||
        MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
 | 
			
		||||
@@ -634,52 +473,53 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
        MD_SIMD_FLOAT fiy3 = simd_zero();
 | 
			
		||||
        MD_SIMD_FLOAT fiz3 = simd_zero();
 | 
			
		||||
 | 
			
		||||
        for(int k = 0; k < numneighs_masked; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
        for(int k = 0; k < numneighs; k++) {
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            int imask = neighs[k].imask;
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
            MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
 | 
			
		||||
            MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
 | 
			
		||||
 | 
			
		||||
            #if CLUSTER_M == CLUSTER_N
 | 
			
		||||
            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 0]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 1]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 2]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 4 + 3]);
 | 
			
		||||
            #else
 | 
			
		||||
            #if CLUSTER_M < CLUSTER_N
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
 | 
			
		||||
            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x3 * cond0));
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x7 * cond0));
 | 
			
		||||
            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0xf * cond0));
 | 
			
		||||
            #elif CLUSTER_M < CLUSTER_N
 | 
			
		||||
            unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
 | 
			
		||||
            unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1));
 | 
			
		||||
            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1));
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1));
 | 
			
		||||
            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0xf * cond0 - 0xff * cond1));
 | 
			
		||||
            #else
 | 
			
		||||
            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
 | 
			
		||||
            unsigned int cond1 = (unsigned int)(cj == ci_cj1);
 | 
			
		||||
            #endif
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 0]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 1]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 2]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_hn[cond0 * 8 + cond1 * 4 + 3]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
 | 
			
		||||
            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0));
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x1 * cond1));
 | 
			
		||||
            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x3 * cond0 - 0x3 * cond1));
 | 
			
		||||
            #endif
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
 | 
			
		||||
            MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
 | 
			
		||||
            MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
 | 
			
		||||
            MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
 | 
			
		||||
            MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
 | 
			
		||||
            MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
 | 
			
		||||
            MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
 | 
			
		||||
            MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
 | 
			
		||||
@@ -691,114 +531,28 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
            MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
 | 
			
		||||
            MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
 | 
			
		||||
            MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
 | 
			
		||||
            MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
 | 
			
		||||
            MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
 | 
			
		||||
            MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
 | 
			
		||||
            MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
 | 
			
		||||
            MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT tx1 = select_by_mask(delx1 * force1, cutoff_mask1);
 | 
			
		||||
            MD_SIMD_FLOAT ty1 = select_by_mask(dely1 * force1, cutoff_mask1);
 | 
			
		||||
            MD_SIMD_FLOAT tz1 = select_by_mask(delz1 * force1, cutoff_mask1);
 | 
			
		||||
            MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT tx3 = select_by_mask(delx3 * force3, cutoff_mask3);
 | 
			
		||||
            MD_SIMD_FLOAT ty3 = select_by_mask(dely3 * force3, cutoff_mask3);
 | 
			
		||||
            MD_SIMD_FLOAT tz3 = select_by_mask(delz3 * force3, cutoff_mask3);
 | 
			
		||||
 | 
			
		||||
            fix0 = simd_add(fix0, tx0);
 | 
			
		||||
            fiy0 = simd_add(fiy0, ty0);
 | 
			
		||||
            fiz0 = simd_add(fiz0, tz0);
 | 
			
		||||
            fix1 = simd_add(fix1, tx1);
 | 
			
		||||
            fiy1 = simd_add(fiy1, ty1);
 | 
			
		||||
            fiz1 = simd_add(fiz1, tz1);
 | 
			
		||||
            fix2 = simd_add(fix2, tx2);
 | 
			
		||||
            fiy2 = simd_add(fiy2, ty2);
 | 
			
		||||
            fiz2 = simd_add(fiz2, tz2);
 | 
			
		||||
            fix3 = simd_add(fix3, tx3);
 | 
			
		||||
            fiy3 = simd_add(fiy3, ty3);
 | 
			
		||||
            fiz3 = simd_add(fiz3, tz3);
 | 
			
		||||
 | 
			
		||||
            #ifdef HALF_NEIGHBOR_LISTS_CHECK_CJ
 | 
			
		||||
            if(cj < CJ1_FROM_CI(atom->Nlocal)) {
 | 
			
		||||
                simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
 | 
			
		||||
                simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
 | 
			
		||||
                simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
 | 
			
		||||
            }
 | 
			
		||||
            #else
 | 
			
		||||
            simd_store(&cj_f[CL_X_OFFSET], simd_load(&cj_f[CL_X_OFFSET]) - (tx0 + tx1 + tx2 + tx3));
 | 
			
		||||
            simd_store(&cj_f[CL_Y_OFFSET], simd_load(&cj_f[CL_Y_OFFSET]) - (ty0 + ty1 + ty2 + ty3));
 | 
			
		||||
            simd_store(&cj_f[CL_Z_OFFSET], simd_load(&cj_f[CL_Z_OFFSET]) - (tz0 + tz1 + tz2 + tz3));
 | 
			
		||||
            #endif
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int k = numneighs_masked; k < numneighs; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            int imask = neighs[k].imask;
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
            MD_FLOAT *cj_f = &atom->cl_f[cj_vec_base];
 | 
			
		||||
            MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
 | 
			
		||||
            MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
 | 
			
		||||
            MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
 | 
			
		||||
            MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask1 = simd_mask_cond_lt(rsq1, cutforcesq_vec);
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask3 = simd_mask_cond_lt(rsq3, cutforcesq_vec);
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
 | 
			
		||||
            MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
 | 
			
		||||
            MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
 | 
			
		||||
            MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT tx0 = select_by_mask(delx0 * force0, cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT ty0 = select_by_mask(dely0 * force0, cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT tz0 = select_by_mask(delz0 * force0, cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT tx1 = select_by_mask(delx1 * force1, cutoff_mask1);
 | 
			
		||||
            MD_SIMD_FLOAT ty1 = select_by_mask(dely1 * force1, cutoff_mask1);
 | 
			
		||||
            MD_SIMD_FLOAT tz1 = select_by_mask(delz1 * force1, cutoff_mask1);
 | 
			
		||||
            MD_SIMD_FLOAT tx2 = select_by_mask(delx2 * force2, cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT ty2 = select_by_mask(dely2 * force2, cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT tz2 = select_by_mask(delz2 * force2, cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT tx3 = select_by_mask(delx3 * force3, cutoff_mask3);
 | 
			
		||||
            MD_SIMD_FLOAT ty3 = select_by_mask(dely3 * force3, cutoff_mask3);
 | 
			
		||||
            MD_SIMD_FLOAT tz3 = select_by_mask(delz3 * force3, cutoff_mask3);
 | 
			
		||||
            MD_SIMD_FLOAT tx0 = select_by_mask(simd_mul(delx0, force0), cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT ty0 = select_by_mask(simd_mul(dely0, force0), cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT tz0 = select_by_mask(simd_mul(delz0, force0), cutoff_mask0);
 | 
			
		||||
            MD_SIMD_FLOAT tx1 = select_by_mask(simd_mul(delx1, force1), cutoff_mask1);
 | 
			
		||||
            MD_SIMD_FLOAT ty1 = select_by_mask(simd_mul(dely1, force1), cutoff_mask1);
 | 
			
		||||
            MD_SIMD_FLOAT tz1 = select_by_mask(simd_mul(delz1, force1), cutoff_mask1);
 | 
			
		||||
            MD_SIMD_FLOAT tx2 = select_by_mask(simd_mul(delx2, force2), cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT ty2 = select_by_mask(simd_mul(dely2, force2), cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT tz2 = select_by_mask(simd_mul(delz2, force2), cutoff_mask2);
 | 
			
		||||
            MD_SIMD_FLOAT tx3 = select_by_mask(simd_mul(delx3, force3), cutoff_mask3);
 | 
			
		||||
            MD_SIMD_FLOAT ty3 = select_by_mask(simd_mul(dely3, force3), cutoff_mask3);
 | 
			
		||||
            MD_SIMD_FLOAT tz3 = select_by_mask(simd_mul(delz3, force3), cutoff_mask3);
 | 
			
		||||
 | 
			
		||||
            fix0 = simd_add(fix0, tx0);
 | 
			
		||||
            fiy0 = simd_add(fiy0, ty0);
 | 
			
		||||
@@ -836,8 +590,6 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_STOP("force");
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    double E = getTimeStamp();
 | 
			
		||||
    DEBUG_MESSAGE("computeForceLJ_4xn end\n");
 | 
			
		||||
    return E-S;
 | 
			
		||||
@@ -846,7 +598,7 @@ double computeForceLJ_4xn_half(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
 | 
			
		||||
    DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
 | 
			
		||||
    int Nlocal = atom->Nlocal;
 | 
			
		||||
    NeighborCluster* neighs;
 | 
			
		||||
    int* neighs;
 | 
			
		||||
    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
 | 
			
		||||
    MD_FLOAT sigma6 = param->sigma6;
 | 
			
		||||
    MD_FLOAT epsilon = param->epsilon;
 | 
			
		||||
@@ -855,6 +607,8 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
 | 
			
		||||
    MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
 | 
			
		||||
    MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
 | 
			
		||||
    double S = getTimeStamp();
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
 | 
			
		||||
@@ -866,13 +620,7 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    double S = getTimeStamp();
 | 
			
		||||
 | 
			
		||||
    #pragma omp parallel
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp parallel for
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        int ci_cj0 = CJ0_FROM_CI(ci);
 | 
			
		||||
        #if CLUSTER_M > CLUSTER_N
 | 
			
		||||
@@ -883,7 +631,6 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
 | 
			
		||||
        neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
 | 
			
		||||
        int numneighs = neighbor->numneigh[ci];
 | 
			
		||||
        int numneighs_masked = neighbor->numneigh_masked[ci];
 | 
			
		||||
 | 
			
		||||
        MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
 | 
			
		||||
        MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
 | 
			
		||||
@@ -910,51 +657,52 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
        MD_SIMD_FLOAT fiy3 = simd_zero();
 | 
			
		||||
        MD_SIMD_FLOAT fiz3 = simd_zero();
 | 
			
		||||
 | 
			
		||||
        for(int k = 0; k < numneighs_masked; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
        for(int k = 0; k < numneighs; k++) {
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            int imask = neighs[k].imask;
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
            MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
 | 
			
		||||
            MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
 | 
			
		||||
 | 
			
		||||
            #if CLUSTER_M == CLUSTER_N
 | 
			
		||||
            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 0]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 1]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 2]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 4 + 3]);
 | 
			
		||||
            #else
 | 
			
		||||
            #if CLUSTER_M < CLUSTER_N
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
 | 
			
		||||
            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
 | 
			
		||||
            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
 | 
			
		||||
            #elif CLUSTER_M < CLUSTER_N
 | 
			
		||||
            unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
 | 
			
		||||
            unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
 | 
			
		||||
            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
 | 
			
		||||
            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
 | 
			
		||||
            #else
 | 
			
		||||
            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
 | 
			
		||||
            unsigned int cond1 = (unsigned int)(cj == ci_cj1);
 | 
			
		||||
            #endif
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 0]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 1]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 2]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32(atom->masks_4xn_fn[cond0 * 8 + cond1 * 4 + 3]);
 | 
			
		||||
            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
 | 
			
		||||
            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0));
 | 
			
		||||
            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1));
 | 
			
		||||
            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1));
 | 
			
		||||
            #endif
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
 | 
			
		||||
            MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
 | 
			
		||||
            MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
 | 
			
		||||
            MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
 | 
			
		||||
            MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
 | 
			
		||||
            MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
 | 
			
		||||
            MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
 | 
			
		||||
            MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
 | 
			
		||||
@@ -966,88 +714,28 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
            MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
 | 
			
		||||
            MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
 | 
			
		||||
            MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
 | 
			
		||||
            MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
 | 
			
		||||
            MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
 | 
			
		||||
            MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
 | 
			
		||||
            MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
 | 
			
		||||
            MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
 | 
			
		||||
 | 
			
		||||
            fix0 = simd_masked_add(fix0, delx0 * force0, cutoff_mask0);
 | 
			
		||||
            fiy0 = simd_masked_add(fiy0, dely0 * force0, cutoff_mask0);
 | 
			
		||||
            fiz0 = simd_masked_add(fiz0, delz0 * force0, cutoff_mask0);
 | 
			
		||||
            fix1 = simd_masked_add(fix1, delx1 * force1, cutoff_mask1);
 | 
			
		||||
            fiy1 = simd_masked_add(fiy1, dely1 * force1, cutoff_mask1);
 | 
			
		||||
            fiz1 = simd_masked_add(fiz1, delz1 * force1, cutoff_mask1);
 | 
			
		||||
            fix2 = simd_masked_add(fix2, delx2 * force2, cutoff_mask2);
 | 
			
		||||
            fiy2 = simd_masked_add(fiy2, dely2 * force2, cutoff_mask2);
 | 
			
		||||
            fiz2 = simd_masked_add(fiz2, delz2 * force2, cutoff_mask2);
 | 
			
		||||
            fix3 = simd_masked_add(fix3, delx3 * force3, cutoff_mask3);
 | 
			
		||||
            fiy3 = simd_masked_add(fiy3, dely3 * force3, cutoff_mask3);
 | 
			
		||||
            fiz3 = simd_masked_add(fiz3, delz3 * force3, cutoff_mask3);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        for(int k = numneighs_masked; k < numneighs; k++) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
            int imask = neighs[k].imask;
 | 
			
		||||
            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
            MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
 | 
			
		||||
            MD_SIMD_FLOAT delx0 = xi0_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely0 = yi0_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz0 = zi0_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx1 = xi1_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely1 = yi1_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz1 = zi1_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx2 = xi2_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely2 = yi2_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz2 = zi2_tmp - zj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delx3 = xi3_tmp - xj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT dely3 = yi3_tmp - yj_tmp;
 | 
			
		||||
            MD_SIMD_FLOAT delz3 = zi3_tmp - zj_tmp;
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, delz0 * delz0));
 | 
			
		||||
            MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, delz1 * delz1));
 | 
			
		||||
            MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, delz2 * delz2));
 | 
			
		||||
            MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, delz3 * delz3));
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec);
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask1 = simd_mask_cond_lt(rsq1, cutforcesq_vec);
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec);
 | 
			
		||||
            MD_SIMD_MASK cutoff_mask3 = simd_mask_cond_lt(rsq3, cutforcesq_vec);
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
 | 
			
		||||
            MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
 | 
			
		||||
            MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
 | 
			
		||||
            MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_1 = sr2_1 * sr2_1 * sr2_1 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec;
 | 
			
		||||
            MD_SIMD_FLOAT sr6_3 = sr2_3 * sr2_3 * sr2_3 * sigma6_vec;
 | 
			
		||||
 | 
			
		||||
            MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force1 = c48_vec * sr6_1 * (sr6_1 - c05_vec) * sr2_1 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec;
 | 
			
		||||
            MD_SIMD_FLOAT force3 = c48_vec * sr6_3 * (sr6_3 - c05_vec) * sr2_3 * eps_vec;
 | 
			
		||||
 | 
			
		||||
            fix0 = simd_masked_add(fix0, delx0 * force0, cutoff_mask0);
 | 
			
		||||
            fiy0 = simd_masked_add(fiy0, dely0 * force0, cutoff_mask0);
 | 
			
		||||
            fiz0 = simd_masked_add(fiz0, delz0 * force0, cutoff_mask0);
 | 
			
		||||
            fix1 = simd_masked_add(fix1, delx1 * force1, cutoff_mask1);
 | 
			
		||||
            fiy1 = simd_masked_add(fiy1, dely1 * force1, cutoff_mask1);
 | 
			
		||||
            fiz1 = simd_masked_add(fiz1, delz1 * force1, cutoff_mask1);
 | 
			
		||||
            fix2 = simd_masked_add(fix2, delx2 * force2, cutoff_mask2);
 | 
			
		||||
            fiy2 = simd_masked_add(fiy2, dely2 * force2, cutoff_mask2);
 | 
			
		||||
            fiz2 = simd_masked_add(fiz2, delz2 * force2, cutoff_mask2);
 | 
			
		||||
            fix3 = simd_masked_add(fix3, delx3 * force3, cutoff_mask3);
 | 
			
		||||
            fiy3 = simd_masked_add(fiy3, dely3 * force3, cutoff_mask3);
 | 
			
		||||
            fiz3 = simd_masked_add(fiz3, delz3 * force3, cutoff_mask3);
 | 
			
		||||
            fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
 | 
			
		||||
            fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
 | 
			
		||||
            fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
 | 
			
		||||
            fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1);
 | 
			
		||||
            fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1);
 | 
			
		||||
            fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1);
 | 
			
		||||
            fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
 | 
			
		||||
            fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
 | 
			
		||||
            fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
 | 
			
		||||
            fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3);
 | 
			
		||||
            fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3);
 | 
			
		||||
            fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        simd_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix1, fix2, fix3);
 | 
			
		||||
@@ -1056,13 +744,10 @@ double computeForceLJ_4xn_full(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
 | 
			
		||||
        addStat(stats->calculated_forces, 1);
 | 
			
		||||
        addStat(stats->num_neighs, numneighs);
 | 
			
		||||
        addStat(stats->force_iters, (long long int)((double)numneighs));
 | 
			
		||||
        //addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
 | 
			
		||||
        addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_STOP("force");
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    double E = getTimeStamp();
 | 
			
		||||
    DEBUG_MESSAGE("computeForceLJ_4xn end\n");
 | 
			
		||||
    return E-S;
 | 
			
		||||
 
 | 
			
		||||
@@ -22,8 +22,25 @@
 | 
			
		||||
#   define KERNEL_NAME              "CUDA"
 | 
			
		||||
#   define CLUSTER_M                8
 | 
			
		||||
#   define CLUSTER_N                VECTOR_WIDTH
 | 
			
		||||
#   define UNROLL_J                 1
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
#   define XX                       0
 | 
			
		||||
#   define YY                       1
 | 
			
		||||
#   define ZZ                       2
 | 
			
		||||
#   define SCLUSTER_SIZE_X          2
 | 
			
		||||
#   define SCLUSTER_SIZE_Y          2
 | 
			
		||||
#   define SCLUSTER_SIZE_Z          2
 | 
			
		||||
#   define SCLUSTER_SIZE            (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z)
 | 
			
		||||
// Picks atom_x(coord) when dim equals XX, atom_y(coord) when dim equals YY, and atom_z(coord) otherwise.
// `dim` is parenthesized so that low-precedence argument expressions (e.g. `a & b`) are compared as a whole.
// Note: `dim` can be evaluated twice, so avoid arguments with side effects.
#   define DIM_COORD(dim,coord)     (((dim) == XX) ? atom_x(coord) : (((dim) == YY) ? atom_y(coord) : atom_z(coord)))
 | 
			
		||||
// Minimum of two values. Written as a GNU statement expression: each argument is evaluated exactly once
// and stored in an int temporary, so non-int arguments are converted to int before the comparison.
#   define MIN(a,b)                 ({int _a = (a), _b = (b); _a < _b ? _a : _b; })
 | 
			
		||||
// Product of CLUSTER_M and SCLUSTER_SIZE. The expansion is parenthesized so the product stays a single
// operand when the macro is used next to operators such as `/` or `%`.
#   define SCLUSTER_M               (CLUSTER_M * SCLUSTER_SIZE)
 | 
			
		||||
 | 
			
		||||
#   define computeForceLJ           computeForceLJSup_cuda
 | 
			
		||||
#else
 | 
			
		||||
#   define computeForceLJ           computeForceLJ_cuda
 | 
			
		||||
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
 | 
			
		||||
#   define initialIntegrate         cudaInitialIntegrate
 | 
			
		||||
#   define finalIntegrate           cudaFinalIntegrate
 | 
			
		||||
#   define updatePbc                cudaUpdatePbc
 | 
			
		||||
@@ -33,15 +50,11 @@
 | 
			
		||||
#   if VECTOR_WIDTH > CLUSTER_M * 2
 | 
			
		||||
#       define KERNEL_NAME          "Simd2xNN"
 | 
			
		||||
#       define CLUSTER_N            (VECTOR_WIDTH / 2)
 | 
			
		||||
#       define UNROLL_I             4
 | 
			
		||||
#       define UNROLL_J             2
 | 
			
		||||
#       define computeForceLJ       computeForceLJ_2xnn
 | 
			
		||||
// Simd4xN
 | 
			
		||||
#   else
 | 
			
		||||
#       define KERNEL_NAME          "Simd4xN"
 | 
			
		||||
#       define CLUSTER_N            VECTOR_WIDTH
 | 
			
		||||
#       define UNROLL_I             4
 | 
			
		||||
#       define UNROLL_J             1
 | 
			
		||||
#       define computeForceLJ       computeForceLJ_4xn
 | 
			
		||||
#   endif
 | 
			
		||||
#   ifdef USE_REFERENCE_VERSION
 | 
			
		||||
@@ -60,16 +73,29 @@
 | 
			
		||||
#   define CJ1_FROM_CI(a)           (a)
 | 
			
		||||
#   define CI_BASE_INDEX(a,b)       ((a) * CLUSTER_N * (b))
 | 
			
		||||
#   define CJ_BASE_INDEX(a,b)       ((a) * CLUSTER_N * (b))
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
#   define CJ1_FROM_SCI(a)          (a)
 | 
			
		||||
#   define SCI_BASE_INDEX(a,b)      ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
 | 
			
		||||
#   define SCJ_BASE_INDEX(a,b)      ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
#elif CLUSTER_M == CLUSTER_N * 2 // M > N
 | 
			
		||||
#   define CJ0_FROM_CI(a)           ((a) << 1)
 | 
			
		||||
#   define CJ1_FROM_CI(a)           (((a) << 1) | 0x1)
 | 
			
		||||
#   define CI_BASE_INDEX(a,b)       ((a) * CLUSTER_M * (b))
 | 
			
		||||
#   define CJ_BASE_INDEX(a,b)       (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
#   define SCI_BASE_INDEX(a,b)      ((a) * CLUSTER_M * SCLUSTER_SIZE * (b))
 | 
			
		||||
#   define SCJ_BASE_INDEX(a,b)      (((a) >> 1) * CLUSTER_M * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (SCLUSTER_SIZE * CLUSTER_M >> 1))
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
#elif CLUSTER_M == CLUSTER_N / 2 // M < N
 | 
			
		||||
#   define CJ0_FROM_CI(a)           ((a) >> 1)
 | 
			
		||||
#   define CJ1_FROM_CI(a)           ((a) >> 1)
 | 
			
		||||
#   define CI_BASE_INDEX(a,b)       (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
 | 
			
		||||
#   define CJ_BASE_INDEX(a,b)       ((a) * CLUSTER_N * (b))
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
#   define SCI_BASE_INDEX(a,b)      (((a) >> 1) * CLUSTER_N * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (CLUSTER_N * SCLUSTER_SIZE >> 1))
 | 
			
		||||
#   define SCJ_BASE_INDEX(a,b)      ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
#else
 | 
			
		||||
#   error "Invalid cluster configuration!"
 | 
			
		||||
#endif
 | 
			
		||||
@@ -83,14 +109,37 @@
 | 
			
		||||
#define CJ_SCALAR_BASE_INDEX(a)     (CJ_BASE_INDEX(a, 1))
 | 
			
		||||
#define CJ_VECTOR_BASE_INDEX(a)     (CJ_BASE_INDEX(a, 3))
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
#define SCI_SCALAR_BASE_INDEX(a)    (SCI_BASE_INDEX(a, 1))
 | 
			
		||||
#define SCI_VECTOR_BASE_INDEX(a)    (SCI_BASE_INDEX(a, 3))
 | 
			
		||||
#define SCJ_SCALAR_BASE_INDEX(a)    (SCJ_BASE_INDEX(a, 1))
 | 
			
		||||
#define SCJ_VECTOR_BASE_INDEX(a)    (SCJ_BASE_INDEX(a, 3))
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
 | 
			
		||||
#if CLUSTER_M >= CLUSTER_N
 | 
			
		||||
#   define CL_X_OFFSET              (0 * CLUSTER_M)
 | 
			
		||||
#   define CL_Y_OFFSET              (1 * CLUSTER_M)
 | 
			
		||||
#   define CL_Z_OFFSET              (2 * CLUSTER_M)
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
#   define SCL_CL_X_OFFSET(ci)      (ci * CLUSTER_M + 0 * SCLUSTER_M)
 | 
			
		||||
#   define SCL_CL_Y_OFFSET(ci)      (ci * CLUSTER_M + 1 * SCLUSTER_M)
 | 
			
		||||
#   define SCL_CL_Z_OFFSET(ci)      (ci * CLUSTER_M + 2 * SCLUSTER_M)
 | 
			
		||||
 | 
			
		||||
#   define SCL_X_OFFSET             (0 * SCLUSTER_M)
 | 
			
		||||
#   define SCL_Y_OFFSET             (1 * SCLUSTER_M)
 | 
			
		||||
#   define SCL_Z_OFFSET             (2 * SCLUSTER_M)
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
#else
 | 
			
		||||
#   define CL_X_OFFSET              (0 * CLUSTER_N)
 | 
			
		||||
#   define CL_Y_OFFSET              (1 * CLUSTER_N)
 | 
			
		||||
#   define CL_Z_OFFSET              (2 * CLUSTER_N)
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
#   define SCL_X_OFFSET             (0 * SCLUSTER_SIZE * CLUSTER_N)
 | 
			
		||||
#   define SCL_Y_OFFSET             (1 * SCLUSTER_SIZE * CLUSTER_N)
 | 
			
		||||
#   define SCL_Z_OFFSET             (2 * SCLUSTER_SIZE * CLUSTER_N)
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
typedef struct {
 | 
			
		||||
@@ -100,6 +149,13 @@ typedef struct {
 | 
			
		||||
    MD_FLOAT bbminz, bbmaxz;
 | 
			
		||||
} Cluster;
 | 
			
		||||
 | 
			
		||||
typedef struct {
 | 
			
		||||
    int nclusters;
 | 
			
		||||
    MD_FLOAT bbminx, bbmaxx;
 | 
			
		||||
    MD_FLOAT bbminy, bbmaxy;
 | 
			
		||||
    MD_FLOAT bbminz, bbmaxz;
 | 
			
		||||
} SuperCluster;
 | 
			
		||||
 | 
			
		||||
typedef struct {
 | 
			
		||||
    int Natoms, Nlocal, Nghost, Nmax;
 | 
			
		||||
    int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
 | 
			
		||||
@@ -121,17 +177,20 @@ typedef struct {
 | 
			
		||||
    Cluster *iclusters, *jclusters;
 | 
			
		||||
    int *icluster_bin;
 | 
			
		||||
    int dummy_cj;
 | 
			
		||||
    MD_UINT *exclusion_filter;
 | 
			
		||||
    MD_FLOAT *diagonal_4xn_j_minus_i;
 | 
			
		||||
    MD_FLOAT *diagonal_2xnn_j_minus_i;
 | 
			
		||||
    unsigned int masks_2xnn_hn[8];
 | 
			
		||||
    unsigned int masks_2xnn_fn[8];
 | 
			
		||||
    unsigned int masks_4xn_hn[16];
 | 
			
		||||
    unsigned int masks_4xn_fn[16];
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
    int Nsclusters, Nsclusters_local, Nsclusters_ghost, Nsclusters_max;
 | 
			
		||||
    MD_FLOAT *scl_x;
 | 
			
		||||
    MD_FLOAT *scl_v;
 | 
			
		||||
    MD_FLOAT *scl_f;
 | 
			
		||||
    int *scl_type;
 | 
			
		||||
    int *icluster_idx;
 | 
			
		||||
    SuperCluster *siclusters;
 | 
			
		||||
    int *sicluster_bin;
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
} Atom;
 | 
			
		||||
 | 
			
		||||
extern void initAtom(Atom*);
 | 
			
		||||
extern void initMasks(Atom*);
 | 
			
		||||
extern void createAtom(Atom*, Parameter*);
 | 
			
		||||
extern int readAtom(Atom*, Parameter*);
 | 
			
		||||
extern int readAtom_pdb(Atom*, Parameter*);
 | 
			
		||||
@@ -139,6 +198,7 @@ extern int readAtom_gro(Atom*, Parameter*);
 | 
			
		||||
extern int readAtom_dmp(Atom*, Parameter*);
 | 
			
		||||
extern void growAtom(Atom*);
 | 
			
		||||
extern void growClusters(Atom*);
 | 
			
		||||
extern void growSuperClusters(Atom*);
 | 
			
		||||
 | 
			
		||||
#ifdef AOS
 | 
			
		||||
#   define POS_DATA_LAYOUT     "AoS"
 | 
			
		||||
 
 | 
			
		||||
@@ -9,35 +9,13 @@
 | 
			
		||||
 | 
			
		||||
#ifndef __NEIGHBOR_H_
 | 
			
		||||
#define __NEIGHBOR_H_
 | 
			
		||||
// Interaction masks from GROMACS, things to remember (maybe these confused just me):
 | 
			
		||||
//   1. These are not "exclusion" masks as the name suggests in GROMACS, but rather
 | 
			
		||||
//      interaction masks (1 = interaction, 0 = no interaction)
 | 
			
		||||
//   2. These are inverted (maybe because that is how they are used in AVX2/AVX512 masking),
 | 
			
		||||
//      so read them from right to left (least significant to most significant bit)
 | 
			
		||||
// The all-interactions mask is the same for all kernels
 | 
			
		||||
#define NBNXN_INTERACTION_MASK_ALL 0xffffffffU
 | 
			
		||||
// 4x4 kernel diagonal mask
 | 
			
		||||
#define NBNXN_INTERACTION_MASK_DIAG 0x08ceU
 | 
			
		||||
// 4x2 kernel diagonal masks
 | 
			
		||||
#define NBNXN_INTERACTION_MASK_DIAG_J2_0 0x0002U
 | 
			
		||||
#define NBNXN_INTERACTION_MASK_DIAG_J2_1 0x002fU
 | 
			
		||||
// 4x8 kernel diagonal masks
 | 
			
		||||
#define NBNXN_INTERACTION_MASK_DIAG_J8_0 0xf0f8fcfeU
 | 
			
		||||
#define NBNXN_INTERACTION_MASK_DIAG_J8_1 0x0080c0e0U
 | 
			
		||||
 | 
			
		||||
typedef struct {
 | 
			
		||||
    int cj;
 | 
			
		||||
    unsigned int imask;
 | 
			
		||||
} NeighborCluster;
 | 
			
		||||
 | 
			
		||||
typedef struct {
 | 
			
		||||
    int every;
 | 
			
		||||
    int ncalls;
 | 
			
		||||
    int* neighbors;
 | 
			
		||||
    int maxneighs;
 | 
			
		||||
    int* numneigh;
 | 
			
		||||
    int* numneigh_masked;
 | 
			
		||||
    int half_neigh;
 | 
			
		||||
    NeighborCluster* neighbors;
 | 
			
		||||
} Neighbor;
 | 
			
		||||
 | 
			
		||||
extern void initNeighbor(Neighbor*, Parameter*);
 | 
			
		||||
@@ -47,6 +25,7 @@ extern void buildNeighbor(Atom*, Neighbor*);
 | 
			
		||||
extern void pruneNeighbor(Parameter*, Atom*, Neighbor*);
 | 
			
		||||
extern void sortAtom(Atom*);
 | 
			
		||||
extern void buildClusters(Atom*);
 | 
			
		||||
extern void buildClustersGPU(Atom*);
 | 
			
		||||
extern void defineJClusters(Atom*);
 | 
			
		||||
extern void binClusters(Atom*);
 | 
			
		||||
extern void updateSingleAtoms(Atom*);
 | 
			
		||||
 
 | 
			
		||||
@@ -16,5 +16,8 @@ extern void setupPbc(Atom*, Parameter*);
 | 
			
		||||
 | 
			
		||||
#ifdef CUDA_TARGET
 | 
			
		||||
extern void cudaUpdatePbc(Atom*, Parameter*, int);
 | 
			
		||||
#if defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
extern void setupPbcGPU(Atom*, Parameter*);
 | 
			
		||||
#endif //defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
#endif
 | 
			
		||||
#endif
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										19
									
								
								gromacs/includes/utils.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								gromacs/includes/utils.h
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,19 @@
 | 
			
		||||
/*
 | 
			
		||||
 * Temporary functions for debugging; remove before proceeding with the pull request
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#ifndef MD_BENCH_UTILS_H
 | 
			
		||||
#define MD_BENCH_UTILS_H
 | 
			
		||||
 | 
			
		||||
#include <atom.h>
 | 
			
		||||
#include <neighbor.h>
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
void verifyClusters(Atom *atom);
 | 
			
		||||
void verifyLayout(Atom *atom);
 | 
			
		||||
void checkAlignment(Atom *atom);
 | 
			
		||||
void showSuperclusters(Atom *atom);
 | 
			
		||||
void printNeighs(Atom *atom, Neighbor *neighbor);
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
 | 
			
		||||
#endif //MD_BENCH_UTILS_H
 | 
			
		||||
@@ -9,6 +9,7 @@
 | 
			
		||||
#ifndef __VTK_H_
 | 
			
		||||
#define __VTK_H_
 | 
			
		||||
extern void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep);
 | 
			
		||||
extern int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep);
 | 
			
		||||
extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
 | 
			
		||||
extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
 | 
			
		||||
extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);
 | 
			
		||||
 
 | 
			
		||||
@@ -38,7 +38,16 @@ extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighb
 | 
			
		||||
extern void copyDataToCUDADevice(Atom *atom);
 | 
			
		||||
extern void copyDataFromCUDADevice(Atom *atom);
 | 
			
		||||
extern void cudaDeviceFree();
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
#include <utils.h>
 | 
			
		||||
extern void buildNeighborGPU(Atom *atom, Neighbor *neighbor);
 | 
			
		||||
extern void pruneNeighborGPU(Parameter *param, Atom *atom, Neighbor *neighbor);
 | 
			
		||||
extern void alignDataToSuperclusters(Atom *atom);
 | 
			
		||||
extern void alignDataFromSuperclusters(Atom *atom);
 | 
			
		||||
extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
#endif //CUDA_TARGET
 | 
			
		||||
 | 
			
		||||
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
 | 
			
		||||
    if(param->force_field == FF_EAM) { initEam(eam, param); }
 | 
			
		||||
@@ -62,11 +71,24 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
 | 
			
		||||
    setupNeighbor(param, atom);
 | 
			
		||||
    setupThermo(param, atom->Natoms);
 | 
			
		||||
    if(param->input_file == NULL) { adjustThermo(param, atom); }
 | 
			
		||||
    #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
    buildClustersGPU(atom);
 | 
			
		||||
    #else
 | 
			
		||||
    buildClusters(atom);
 | 
			
		||||
    #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
    defineJClusters(atom);
 | 
			
		||||
    #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
    setupPbcGPU(atom, param);
 | 
			
		||||
    //setupPbc(atom, param);
 | 
			
		||||
    #else
 | 
			
		||||
    setupPbc(atom, param);
 | 
			
		||||
    #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
    binClusters(atom);
 | 
			
		||||
    #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
    buildNeighborGPU(atom, neighbor);
 | 
			
		||||
    #else
 | 
			
		||||
    buildNeighbor(atom, neighbor);
 | 
			
		||||
    #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
    initDevice(atom, neighbor);
 | 
			
		||||
    E = getTimeStamp();
 | 
			
		||||
    return E-S;
 | 
			
		||||
@@ -78,11 +100,24 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
    LIKWID_MARKER_START("reneighbour");
 | 
			
		||||
    updateSingleAtoms(atom);
 | 
			
		||||
    updateAtomsPbc(atom, param);
 | 
			
		||||
    #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
    buildClustersGPU(atom);
 | 
			
		||||
    #else
 | 
			
		||||
    buildClusters(atom);
 | 
			
		||||
    #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
    defineJClusters(atom);
 | 
			
		||||
    #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
    //setupPbcGPU(atom, param);
 | 
			
		||||
    setupPbc(atom, param);
 | 
			
		||||
    #else
 | 
			
		||||
    setupPbc(atom, param);
 | 
			
		||||
    #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
    binClusters(atom);
 | 
			
		||||
    #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
    buildNeighborGPU(atom, neighbor);
 | 
			
		||||
    #else
 | 
			
		||||
    buildNeighbor(atom, neighbor);
 | 
			
		||||
    #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
    LIKWID_MARKER_STOP("reneighbour");
 | 
			
		||||
    E = getTimeStamp();
 | 
			
		||||
    return E-S;
 | 
			
		||||
@@ -209,6 +244,8 @@ int main(int argc, char** argv) {
 | 
			
		||||
    printParameter(&param);
 | 
			
		||||
    printf(HLINE);
 | 
			
		||||
 | 
			
		||||
    //verifyNeigh(&atom, &neighbor);
 | 
			
		||||
 | 
			
		||||
    printf("step\ttemp\t\tpressure\n");
 | 
			
		||||
    computeThermo(0, &param, &atom);
 | 
			
		||||
    #if defined(MEM_TRACER) || defined(INDEX_TRACER)
 | 
			
		||||
@@ -237,14 +274,23 @@ int main(int argc, char** argv) {
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    for(int n = 0; n < param.ntimes; n++) {
 | 
			
		||||
 | 
			
		||||
        //printf("Step:\t%d\r\n", n);
 | 
			
		||||
 | 
			
		||||
        initialIntegrate(&param, &atom);
 | 
			
		||||
 | 
			
		||||
        if((n + 1) % param.reneigh_every) {
 | 
			
		||||
            if(!((n + 1) % param.prune_every)) {
 | 
			
		||||
                #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
                pruneNeighborGPU(&param, &atom, &neighbor);
 | 
			
		||||
                #else
 | 
			
		||||
                pruneNeighbor(&param, &atom, &neighbor);
 | 
			
		||||
                #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            copyDataFromCUDADevice(&atom);
 | 
			
		||||
            updatePbc(&atom, &param, 0);
 | 
			
		||||
            copyDataToCUDADevice(&atom);
 | 
			
		||||
        } else {
 | 
			
		||||
            #ifdef CUDA_TARGET
 | 
			
		||||
            copyDataFromCUDADevice(&atom);
 | 
			
		||||
@@ -262,12 +308,29 @@ int main(int argc, char** argv) {
 | 
			
		||||
        traceAddresses(&param, &atom, &neighbor, n + 1);
 | 
			
		||||
        #endif
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
        /*
 | 
			
		||||
        printf("%d\t%d\r\n", atom.Nsclusters_local, atom.Nclusters_local);
 | 
			
		||||
        copyDataToCUDADevice(&atom);
 | 
			
		||||
        verifyLayout(&atom);
 | 
			
		||||
 | 
			
		||||
        //printClusterIndices(&atom);
 | 
			
		||||
 | 
			
		||||
        */
 | 
			
		||||
 | 
			
		||||
        if(param.force_field == FF_EAM) {
 | 
			
		||||
            timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats);
 | 
			
		||||
        } else {
 | 
			
		||||
            timer[FORCE] += computeForceLJ(&param, &atom, &neighbor, &stats);
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        /*
 | 
			
		||||
        copyDataFromCUDADevice(&atom);
 | 
			
		||||
        verifyLayout(&atom);
 | 
			
		||||
 | 
			
		||||
        getchar();
 | 
			
		||||
        */
 | 
			
		||||
 | 
			
		||||
        finalIntegrate(&param, &atom);
 | 
			
		||||
 | 
			
		||||
        if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
 | 
			
		||||
 
 | 
			
		||||
@@ -56,7 +56,6 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
 | 
			
		||||
    neighbor->half_neigh = param->half_neigh;
 | 
			
		||||
    neighbor->maxneighs = 100;
 | 
			
		||||
    neighbor->numneigh = NULL;
 | 
			
		||||
    neighbor->numneigh_masked = NULL;
 | 
			
		||||
    neighbor->neighbors = NULL;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -78,8 +77,13 @@ void setupNeighbor(Parameter *param, Atom *atom) {
 | 
			
		||||
 | 
			
		||||
    MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
 | 
			
		||||
    MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N);
 | 
			
		||||
    #ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
    MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density) * (MD_FLOAT)SCLUSTER_SIZE_X;
 | 
			
		||||
    MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density) * (MD_FLOAT)SCLUSTER_SIZE_Y;
 | 
			
		||||
    #else
 | 
			
		||||
    MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density);
 | 
			
		||||
    MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density);
 | 
			
		||||
    #endif
 | 
			
		||||
    nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex));
 | 
			
		||||
    nbiny = MAX(1, (int)ceil((yhi - ylo) / targetsizey));
 | 
			
		||||
    binsizex = (xhi - xlo) / nbinx;
 | 
			
		||||
@@ -185,43 +189,29 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
 | 
			
		||||
    return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* Returns a diagonal or off-diagonal interaction mask for plain C lists */
 | 
			
		||||
static unsigned int get_imask(int rdiag, int ci, int cj) {
 | 
			
		||||
    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
 | 
			
		||||
}
 | 
			
		||||
int atomDistanceInRangeGPU(Atom *atom, int sci, int cj, MD_FLOAT rsq) {
 | 
			
		||||
    for (int ci = 0; ci < atom->siclusters[sci].nclusters; ci++) {
 | 
			
		||||
        const int icluster_idx = atom->icluster_idx[SCLUSTER_SIZE * sci + ci];
 | 
			
		||||
        int ci_vec_base = CI_VECTOR_BASE_INDEX(icluster_idx);
 | 
			
		||||
        int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 | 
			
		||||
        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
 | 
			
		||||
        MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
 | 
			
		||||
 | 
			
		||||
/* Returns a diagonal or off-diagonal interaction mask for cj-size=2 */
 | 
			
		||||
static unsigned int get_imask_simd_j2(int rdiag, int ci, int cj) {
 | 
			
		||||
    return (rdiag && ci * 2 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_0
 | 
			
		||||
                                  : (rdiag && ci * 2 + 1 == cj ? NBNXN_INTERACTION_MASK_DIAG_J2_1
 | 
			
		||||
                                                               : NBNXN_INTERACTION_MASK_ALL));
 | 
			
		||||
}
 | 
			
		||||
        for(int cii = 0; cii < atom->iclusters[icluster_idx].natoms; cii++) {
 | 
			
		||||
            for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
 | 
			
		||||
                MD_FLOAT delx = ci_x[CL_X_OFFSET + cii] - cj_x[CL_X_OFFSET + cjj];
 | 
			
		||||
                MD_FLOAT dely = ci_x[CL_Y_OFFSET + cii] - cj_x[CL_Y_OFFSET + cjj];
 | 
			
		||||
                MD_FLOAT delz = ci_x[CL_Z_OFFSET + cii] - cj_x[CL_Z_OFFSET + cjj];
 | 
			
		||||
                if(delx * delx + dely * dely + delz * delz < rsq) {
 | 
			
		||||
                    return 1;
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
/* Returns a diagonal or off-diagonal interaction mask for cj-size=4 */
 | 
			
		||||
static unsigned int get_imask_simd_j4(int rdiag, int ci, int cj) {
 | 
			
		||||
    return (rdiag && ci == cj ? NBNXN_INTERACTION_MASK_DIAG : NBNXN_INTERACTION_MASK_ALL);
 | 
			
		||||
    return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* Returns a diagonal or off-diagonal interaction mask for cj-size=8 */
 | 
			
		||||
static unsigned int get_imask_simd_j8(int rdiag, int ci, int cj) {
 | 
			
		||||
    return (rdiag && ci == cj * 2 ? NBNXN_INTERACTION_MASK_DIAG_J8_0
 | 
			
		||||
                                  : (rdiag && ci == cj * 2 + 1 ? NBNXN_INTERACTION_MASK_DIAG_J8_1
 | 
			
		||||
                                                               : NBNXN_INTERACTION_MASK_ALL));
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#if VECTOR_WIDTH == 2
 | 
			
		||||
#   define get_imask_simd_4xn get_imask_simd_j2
 | 
			
		||||
#elif VECTOR_WIDTH== 4
 | 
			
		||||
#   define get_imask_simd_4xn get_imask_simd_j4
 | 
			
		||||
#elif VECTOR_WIDTH == 8
 | 
			
		||||
#   define get_imask_simd_4xn get_imask_simd_j8
 | 
			
		||||
#   define get_imask_simd_2xnn get_imask_simd_j4
 | 
			
		||||
#elif VECTOR_WIDTH == 16
 | 
			
		||||
#   define get_imask_simd_2xnn get_imask_simd_j8
 | 
			
		||||
#else
 | 
			
		||||
#   error "Invalid cluster configuration"
 | 
			
		||||
#endif
 | 
			
		||||
 | 
			
		||||
void buildNeighbor(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
    DEBUG_MESSAGE("buildNeighbor start\n");
 | 
			
		||||
 | 
			
		||||
@@ -231,8 +221,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
        if(neighbor->numneigh) free(neighbor->numneigh);
 | 
			
		||||
        if(neighbor->neighbors) free(neighbor->neighbors);
 | 
			
		||||
        neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
 | 
			
		||||
        neighbor->numneigh_masked = (int*) malloc(nmax * sizeof(int));
 | 
			
		||||
        neighbor->neighbors = (NeighborCluster*) malloc(nmax * neighbor->maxneighs * sizeof(NeighborCluster));
 | 
			
		||||
        neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
 | 
			
		||||
@@ -248,8 +237,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
 | 
			
		||||
        for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
            int ci_cj1 = CJ1_FROM_CI(ci);
 | 
			
		||||
            NeighborCluster *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
 | 
			
		||||
            int n = 0, nmasked = 0;
 | 
			
		||||
            int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
 | 
			
		||||
            int n = 0;
 | 
			
		||||
            int ibin = atom->icluster_bin[ci];
 | 
			
		||||
            MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
 | 
			
		||||
            MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
 | 
			
		||||
@@ -314,28 +303,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
 | 
			
		||||
                            if(d_bb_sq < cutneighsq) {
 | 
			
		||||
                                if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
 | 
			
		||||
                                    // We use true (1) for rdiag because we only care if there are masks
 | 
			
		||||
                                    // at all, and when this is set to false (0) the self-exclusions are
 | 
			
		||||
                                    // not accounted for, which breaks the optimized version!
 | 
			
		||||
                                    unsigned int imask;
 | 
			
		||||
                                    #if CLUSTER_N == (VECTOR_WIDTH / 2) // 2xnn
 | 
			
		||||
                                    imask = get_imask_simd_2xnn(1, ci, cj);
 | 
			
		||||
                                    #else // 4xn
 | 
			
		||||
                                    imask = get_imask_simd_4xn(1, ci, cj);
 | 
			
		||||
                                    #endif
 | 
			
		||||
 | 
			
		||||
                                    if(imask == NBNXN_INTERACTION_MASK_ALL) {
 | 
			
		||||
                                        neighptr[n].cj = cj;
 | 
			
		||||
                                        neighptr[n].imask = imask;
 | 
			
		||||
                                    } else {
 | 
			
		||||
                                        neighptr[n].cj = neighptr[nmasked].cj;
 | 
			
		||||
                                        neighptr[n].imask = neighptr[nmasked].imask;
 | 
			
		||||
                                        neighptr[nmasked].cj = cj;
 | 
			
		||||
                                        neighptr[nmasked].imask = imask;
 | 
			
		||||
                                        nmasked++;
 | 
			
		||||
                                    }
 | 
			
		||||
 | 
			
		||||
                                    n++;
 | 
			
		||||
                                    neighptr[n++] = cj;
 | 
			
		||||
                                }
 | 
			
		||||
                            }
 | 
			
		||||
                        }
 | 
			
		||||
@@ -357,14 +325,11 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
            // Fill neighbor list with dummy values to fit vector width
 | 
			
		||||
            if(CLUSTER_N < VECTOR_WIDTH) {
 | 
			
		||||
                while(n % (VECTOR_WIDTH / CLUSTER_N)) {
 | 
			
		||||
                    neighptr[n].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
 | 
			
		||||
                    neighptr[n].imask = 0;
 | 
			
		||||
                    n++;
 | 
			
		||||
                    neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            neighbor->numneigh[ci] = n;
 | 
			
		||||
            neighbor->numneigh_masked[ci] = nmasked;
 | 
			
		||||
            if(n >= neighbor->maxneighs) {
 | 
			
		||||
                resize = 1;
 | 
			
		||||
 | 
			
		||||
@@ -378,7 +343,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
            fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
 | 
			
		||||
            neighbor->maxneighs = new_maxneighs * 1.2;
 | 
			
		||||
            free(neighbor->neighbors);
 | 
			
		||||
            neighbor->neighbors = (NeighborCluster*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
 | 
			
		||||
            neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
@@ -427,33 +392,212 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
    DEBUG_MESSAGE("buildNeighbor end\n");
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
// TODO For future parallelization on GPU
 | 
			
		||||
/*
 * Build the neighbor list of every local super-cluster.
 *
 * For each super-cluster, the j-clusters stored in the stencil bins around its
 * bin are tested: first by bounding-box distance and, when that alone is not
 * conclusive, by atomDistanceInRangeGPU(). Accepted j-cluster indices go to
 * neighbor->neighbors (maxneighs slots per super-cluster) and their count to
 * neighbor->numneigh. If any list reaches maxneighs, the storage is enlarged
 * and the whole build is repeated.
 */
void buildNeighborGPU(Atom *atom, Neighbor *neighbor) {
    DEBUG_MESSAGE("buildNeighborGPU start\n");

    /* extend neighbor arrays if necessary */
    if(atom->Nsclusters_local > nmax) {
        nmax = atom->Nsclusters_local;
        free(neighbor->numneigh);
        free(neighbor->neighbors);
        neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
        // Entries are int cluster indices, so the element size is sizeof(int)
        neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
    }

    MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
    MD_FLOAT bby = 0.5 * (binsizey + binsizey);
    // Bounding boxes closer than sqrt(rbb_sq) are accepted without the per-atom test
    MD_FLOAT rbb_sq = MAX(0.0, cutneigh - 0.5 * sqrt(bbx * bbx + bby * bby));
    rbb_sq = rbb_sq * rbb_sq;
    int resize = 1;

    /* loop over each super-cluster, storing neighbors */
    while(resize) {
        int new_maxneighs = neighbor->maxneighs;
        resize = 0;

        for(int sci = 0; sci < atom->Nsclusters_local; sci++) {
            int ci_cj1 = CJ1_FROM_SCI(sci);
            int *neighptr = &(neighbor->neighbors[sci * neighbor->maxneighs]);
            int n = 0;
            int ibin = atom->sicluster_bin[sci];
            MD_FLOAT ibb_xmin = atom->siclusters[sci].bbminx;
            MD_FLOAT ibb_xmax = atom->siclusters[sci].bbmaxx;
            MD_FLOAT ibb_ymin = atom->siclusters[sci].bbminy;
            MD_FLOAT ibb_ymax = atom->siclusters[sci].bbmaxy;
            MD_FLOAT ibb_zmin = atom->siclusters[sci].bbminz;
            MD_FLOAT ibb_zmax = atom->siclusters[sci].bbmaxz;

            for(int k = 0; k < nstencil; k++) {
                int jbin = ibin + stencil[k];
                int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
                int cj, m = -1;
                MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
                const int c = bin_nclusters[jbin];

                if(c > 0) {
                    MD_FLOAT dl, dh, dm, dm0;
                    // Start "out of range" so that a `continue` on the very first
                    // iteration evaluates the loop condition on a defined value
                    MD_FLOAT d_bb_sq = INFINITY;

                    // Skip leading j-clusters that are too far away along z
                    do {
                        m++;
                        cj = loc_bin[m];
                        if(neighbor->half_neigh && ci_cj1 > cj) {
                            continue;
                        }
                        jbb_zmin = atom->jclusters[cj].bbminz;
                        jbb_zmax = atom->jclusters[cj].bbmaxz;
                        dl = ibb_zmin - jbb_zmax;
                        dh = jbb_zmin - ibb_zmax;
                        dm = MAX(dl, dh);
                        dm0 = MAX(dm, 0.0);
                        d_bb_sq = dm0 * dm0;
                    } while(m + 1 < c && d_bb_sq > cutneighsq);

                    jbb_xmin = atom->jclusters[cj].bbminx;
                    jbb_xmax = atom->jclusters[cj].bbmaxx;
                    jbb_ymin = atom->jclusters[cj].bbminy;
                    jbb_ymax = atom->jclusters[cj].bbmaxy;

                    while(m < c) {
                        if(!neighbor->half_neigh || ci_cj1 <= cj) {
                            // Squared distance between the two bounding boxes,
                            // accumulated over the z, y and x axes
                            dl = ibb_zmin - jbb_zmax;
                            dh = jbb_zmin - ibb_zmax;
                            dm = MAX(dl, dh);
                            dm0 = MAX(dm, 0.0);
                            d_bb_sq = dm0 * dm0;

                            dl = ibb_ymin - jbb_ymax;
                            dh = jbb_ymin - ibb_ymax;
                            dm = MAX(dl, dh);
                            dm0 = MAX(dm, 0.0);
                            d_bb_sq += dm0 * dm0;

                            dl = ibb_xmin - jbb_xmax;
                            dh = jbb_xmin - ibb_xmax;
                            dm = MAX(dl, dh);
                            dm0 = MAX(dm, 0.0);
                            d_bb_sq += dm0 * dm0;

                            if(d_bb_sq < cutneighsq) {
                                if(d_bb_sq < rbb_sq || atomDistanceInRangeGPU(atom, sci, cj, cutneighsq)) {
                                    // Keep counting past the capacity so the resize below
                                    // sees the required size, but never store out of bounds
                                    if(n < neighbor->maxneighs) {
                                        neighptr[n] = cj;
                                    }
                                    n++;
                                }
                            }
                        }

                        m++;
                        if(m < c) {
                            cj = loc_bin[m];
                            jbb_xmin = atom->jclusters[cj].bbminx;
                            jbb_xmax = atom->jclusters[cj].bbmaxx;
                            jbb_ymin = atom->jclusters[cj].bbminy;
                            jbb_ymax = atom->jclusters[cj].bbmaxy;
                            jbb_zmin = atom->jclusters[cj].bbminz;
                            jbb_zmax = atom->jclusters[cj].bbmaxz;
                        }
                    }
                }
            }

            // Fill neighbor list with dummy values to fit vector width
            if(CLUSTER_N < VECTOR_WIDTH) {
                while(n % (VECTOR_WIDTH / CLUSTER_N)) {
                    if(n < neighbor->maxneighs) {
                        neighptr[n] = atom->dummy_cj; // Last cluster is always a dummy cluster
                    }
                    n++;
                }
            }

            neighbor->numneigh[sci] = n;
            if(n >= neighbor->maxneighs) {
                resize = 1;

                if(n >= new_maxneighs) {
                    new_maxneighs = n;
                }
            }
        }

        if(resize) {
            fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
            neighbor->maxneighs = new_maxneighs * 1.2;
            free(neighbor->neighbors);
            // One list per super-cluster: same row count as the initial allocation
            neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
        }
    }

    DEBUG_MESSAGE("buildNeighborGPU end\n");
}
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
 | 
			
		||||
void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
    DEBUG_MESSAGE("pruneNeighbor start\n");
 | 
			
		||||
    //MD_FLOAT cutsq = param->cutforce * param->cutforce;
 | 
			
		||||
    MD_FLOAT cutsq = cutneighsq;
 | 
			
		||||
 | 
			
		||||
    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
 | 
			
		||||
        NeighborCluster *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
 | 
			
		||||
        int *neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
 | 
			
		||||
        int numneighs = neighbor->numneigh[ci];
 | 
			
		||||
        int numneighs_masked = neighbor->numneigh_masked[ci];
 | 
			
		||||
        int k = 0;
 | 
			
		||||
 | 
			
		||||
        // Remove dummy clusters if necessary
 | 
			
		||||
        if(CLUSTER_N < VECTOR_WIDTH) {
 | 
			
		||||
            while(neighs[numneighs - 1].cj == atom->dummy_cj) {
 | 
			
		||||
            while(neighs[numneighs - 1] == atom->dummy_cj) {
 | 
			
		||||
                numneighs--;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        while(k < numneighs) {
 | 
			
		||||
            int cj = neighs[k].cj;
 | 
			
		||||
            int cj = neighs[k];
 | 
			
		||||
            if(atomDistanceInRange(atom, ci, cj, cutsq)) {
 | 
			
		||||
                k++;
 | 
			
		||||
            } else {
 | 
			
		||||
                numneighs--;
 | 
			
		||||
                if(k < numneighs_masked) {
 | 
			
		||||
                    numneighs_masked--;
 | 
			
		||||
                }
 | 
			
		||||
                neighs[k] = neighs[numneighs];
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
@@ -461,19 +605,63 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
 | 
			
		||||
        // Readd dummy clusters if necessary
 | 
			
		||||
        if(CLUSTER_N < VECTOR_WIDTH) {
 | 
			
		||||
            while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
 | 
			
		||||
                neighs[numneighs].cj = atom->dummy_cj; // Last cluster is always a dummy cluster
 | 
			
		||||
                neighs[numneighs].imask = 0;
 | 
			
		||||
                numneighs++;
 | 
			
		||||
                neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        neighbor->numneigh[ci] = numneighs;
 | 
			
		||||
        neighbor->numneigh_masked[ci] = numneighs_masked;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    DEBUG_MESSAGE("pruneNeighbor end\n");
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
/*
 * Prune the neighbor list of every local super-cluster.
 *
 * For each i-cluster registered for a super-cluster in atom->icluster_idx,
 * j-clusters for which atomDistanceInRange() fails (using cutneighsq as the
 * cutoff) are removed from the super-cluster's list by swapping in the last
 * entry. Dummy padding entries are stripped before pruning and re-added
 * afterwards so the list length stays a multiple of VECTOR_WIDTH / CLUSTER_N.
 *
 * `param` is currently unused.
 *
 * NOTE(review): every i-cluster pass prunes the same shared list, so an entry
 * survives only if it is in range of all i-clusters of the super-cluster.
 * Confirm whether "in range of any i-cluster" was intended.
 */
void pruneNeighborGPU(Parameter *param, Atom *atom, Neighbor *neighbor) {
    DEBUG_MESSAGE("pruneNeighborGPU start\n");
    //MD_FLOAT cutsq = param->cutforce * param->cutforce;
    MD_FLOAT cutsq = cutneighsq;

    for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
        for (int scii = 0; scii < atom->siclusters[sci].nclusters; scii++) {
            // Index by the position inside the super-cluster (scii)
            const int ci = atom->icluster_idx[SCLUSTER_SIZE * sci + scii];

            int *neighs = &neighbor->neighbors[sci * neighbor->maxneighs];
            int numneighs = neighbor->numneigh[sci];
            int k = 0;

            // Remove dummy clusters if necessary
            if(CLUSTER_N < VECTOR_WIDTH) {
                // Stop at an empty list instead of reading neighs[-1]
                while(numneighs > 0 && neighs[numneighs - 1] == atom->dummy_cj) {
                    numneighs--;
                }
            }

            while(k < numneighs) {
                int cj = neighs[k];
                if(atomDistanceInRange(atom, ci, cj, cutsq)) {
                    k++;
                } else {
                    // Overwrite the rejected entry with the last one; k is not
                    // advanced so the moved entry gets tested next
                    numneighs--;
                    neighs[k] = neighs[numneighs];
                }
            }

            // Readd dummy clusters if necessary
            if(CLUSTER_N < VECTOR_WIDTH) {
                while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
                    neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
                }
            }

            neighbor->numneigh[sci] = numneighs;
        }
    }

    DEBUG_MESSAGE("pruneNeighborGPU end\n");
}
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
 | 
			
		||||
/* internal subroutines */
 | 
			
		||||
MD_FLOAT bindist(int i, int j) {
 | 
			
		||||
    MD_FLOAT delx, dely, delz;
 | 
			
		||||
@@ -599,6 +787,36 @@ void sortAtomsByZCoord(Atom *atom) {
 | 
			
		||||
    DEBUG_MESSAGE("sortAtomsByZCoord end\n");
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
// TODO: Use pigeonhole sorting
 | 
			
		||||
/*
 * Selection-sort a slice of one bin's atom index list in ascending order of
 * the coordinate selected by `dim`.
 *
 * bin          index of the bin whose entries in `bins` are reordered
 * start_index  first slot of the slice (inclusive)
 * end_index    last slot of the slice (inclusive); nothing happens if it is
 *              smaller than start_index
 */
void sortAtomsByCoord(Atom *atom, int dim, int bin, int start_index, int end_index) {
    int *slots = &bins[bin * atoms_per_bin];

    for(int pos = start_index; pos <= end_index; pos++) {
        const int current = slots[pos];
        int best_pos = pos;
        int best_atom = current;
        MD_FLOAT best_coord = DIM_COORD(dim, current);

        // Find the smallest coordinate in the not yet sorted remainder
        for(int scan = pos + 1; scan <= end_index; scan++) {
            const int candidate = slots[scan];
            const MD_FLOAT candidate_coord = DIM_COORD(dim, candidate);

            if(!(candidate_coord < best_coord)) {
                continue;
            }

            best_pos = scan;
            best_atom = candidate;
            best_coord = candidate_coord;
        }

        // Swap the minimum into place
        slots[pos] = best_atom;
        slots[best_pos] = current;
    }
}
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
 | 
			
		||||
void buildClusters(Atom *atom) {
 | 
			
		||||
    DEBUG_MESSAGE("buildClusters start\n");
 | 
			
		||||
    atom->Nclusters_local = 0;
 | 
			
		||||
@@ -675,6 +893,153 @@ void buildClusters(Atom *atom) {
 | 
			
		||||
    DEBUG_MESSAGE("buildClusters end\n");
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
/*
 * Bin the local atoms and group them into i-clusters and super-clusters.
 *
 * Per bin, the atoms are sorted along z; each super-cluster slab is then split
 * into SCLUSTER_SIZE_Z layers that are sorted along y, and each layer into
 * SCLUSTER_SIZE_Y rows that are sorted along x. Every row supplies
 * SCLUSTER_SIZE_X clusters of CLUSTER_M atoms. Positions and velocities are
 * copied into the cluster arrays (cl_x/cl_v) and the super-cluster arrays
 * (scl_x/scl_v); slots without an atom get INFINITY coordinates. Bounding
 * boxes are recorded for each cluster and each super-cluster, and
 * atom->icluster_idx maps (super-cluster, position) to the cluster index.
 */
void buildClustersGPU(Atom *atom) {
    DEBUG_MESSAGE("buildClustersGPU start\n");
    atom->Nclusters_local = 0;
    atom->Nsclusters_local = 0;

    /* bin local atoms */
    binAtoms(atom);

    // Atoms per x-row and per z-layer of a super-cluster
    const int row_atoms = SCLUSTER_SIZE_X * CLUSTER_M;
    const int layer_atoms = SCLUSTER_SIZE_Y * row_atoms;

    for(int bin = 0; bin < mbins; bin++) {
        int c = bincount[bin];
        sortAtomsByCoord(atom, ZZ, bin, 0, c - 1);
        int ac = 0;
        int nclusters = ((c + CLUSTER_M - 1) / CLUSTER_M);
        if(CLUSTER_N > CLUSTER_M && nclusters % 2) { nclusters++; }
        const int supercluster_size = SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z;
        int nsclusters = ((nclusters + supercluster_size - 1) / supercluster_size);

        for(int scl = 0; scl < nsclusters; scl++) {
            const int sci = atom->Nsclusters_local;
            if(sci >= atom->Nsclusters_max) {
                growSuperClusters(atom);
            }

            int scl_offset = scl * SCLUSTER_SIZE * CLUSTER_M;
            MD_FLOAT sc_bbminx = INFINITY, sc_bbmaxx = -INFINITY;
            MD_FLOAT sc_bbminy = INFINITY, sc_bbmaxy = -INFINITY;
            MD_FLOAT sc_bbminz = INFINITY, sc_bbmaxz = -INFINITY;
            atom->siclusters[sci].nclusters = 0;

            for(int scl_z = 0; scl_z < SCLUSTER_SIZE_Z; scl_z++) {
                const int atom_scl_z_offset = scl_offset + scl_z * layer_atoms;
                const int atom_scl_z_end_idx = MIN(atom_scl_z_offset + layer_atoms - 1, c - 1);
                sortAtomsByCoord(atom, YY, bin, atom_scl_z_offset, atom_scl_z_end_idx);

                for(int scl_y = 0; scl_y < SCLUSTER_SIZE_Y; scl_y++) {
                    // A row holds SCLUSTER_SIZE_X clusters, so rows are row_atoms apart
                    const int atom_scl_y_offset = atom_scl_z_offset + scl_y * row_atoms;
                    const int atom_scl_y_end_idx = MIN(atom_scl_y_offset + row_atoms - 1, c - 1);
                    sortAtomsByCoord(atom, XX, bin, atom_scl_y_offset, atom_scl_y_end_idx);

                    for(int scl_x = 0; scl_x < SCLUSTER_SIZE_X; scl_x++) {
                        // Row-major position inside the super-cluster: (z, y, x)
                        const int cluster_sup_idx = (scl_z * SCLUSTER_SIZE_Y + scl_y) * SCLUSTER_SIZE_X + scl_x;
                        const int ci = atom->Nclusters_local;
                        if(ci >= atom->Nclusters_max) {
                            growClusters(atom);
                        }

                        // Base pointers are taken after any growth above
                        int ci_sca_base = CI_SCALAR_BASE_INDEX(ci);
                        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
                        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
                        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];

                        int sci_vec_base = SCI_VECTOR_BASE_INDEX(sci);
                        MD_FLOAT *sci_x = &atom->scl_x[sci_vec_base];
                        MD_FLOAT *sci_v = &atom->scl_v[sci_vec_base];

                        // Slot of this cluster within the super-cluster arrays
                        const int scl_ci = atom->siclusters[sci].nclusters;

                        int *ci_type = &atom->cl_type[ci_sca_base];
                        MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
                        MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
                        MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;

                        atom->iclusters[ci].natoms = 0;
                        for(int cii = 0; cii < CLUSTER_M; cii++) {
                            if(ac < c) {
                                int i = bins[bin * atoms_per_bin + ac];
                                MD_FLOAT xtmp = atom_x(i);
                                MD_FLOAT ytmp = atom_y(i);
                                MD_FLOAT ztmp = atom_z(i);

                                ci_x[CL_X_OFFSET + cii] = xtmp;
                                ci_x[CL_Y_OFFSET + cii] = ytmp;
                                ci_x[CL_Z_OFFSET + cii] = ztmp;
                                ci_v[CL_X_OFFSET + cii] = atom->vx[i];
                                ci_v[CL_Y_OFFSET + cii] = atom->vy[i];
                                ci_v[CL_Z_OFFSET + cii] = atom->vz[i];

                                sci_x[SCL_CL_X_OFFSET(scl_ci) + cii] = xtmp;
                                sci_x[SCL_CL_Y_OFFSET(scl_ci) + cii] = ytmp;
                                sci_x[SCL_CL_Z_OFFSET(scl_ci) + cii] = ztmp;
                                sci_v[SCL_CL_X_OFFSET(scl_ci) + cii] = atom->vx[i];
                                sci_v[SCL_CL_Y_OFFSET(scl_ci) + cii] = atom->vy[i];
                                sci_v[SCL_CL_Z_OFFSET(scl_ci) + cii] = atom->vz[i];

                                // TODO: To create the bounding boxes faster, we can use SIMD operations
                                if(bbminx > xtmp) { bbminx = xtmp; }
                                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
                                if(bbminy > ytmp) { bbminy = ytmp; }
                                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
                                if(bbminz > ztmp) { bbminz = ztmp; }
                                if(bbmaxz < ztmp) { bbmaxz = ztmp; }

                                ci_type[cii] = atom->type[i];
                                atom->iclusters[ci].natoms++;
                            } else {
                                // Padding slot: no atom left in this bin
                                ci_x[CL_X_OFFSET + cii] = INFINITY;
                                ci_x[CL_Y_OFFSET + cii] = INFINITY;
                                ci_x[CL_Z_OFFSET + cii] = INFINITY;

                                sci_x[SCL_CL_X_OFFSET(scl_ci) + cii] = INFINITY;
                                sci_x[SCL_CL_Y_OFFSET(scl_ci) + cii] = INFINITY;
                                sci_x[SCL_CL_Z_OFFSET(scl_ci) + cii] = INFINITY;
                            }

                            ac++;
                        }

                        atom->icluster_bin[ci] = bin;
                        atom->iclusters[ci].bbminx = bbminx;
                        atom->iclusters[ci].bbmaxx = bbmaxx;
                        atom->iclusters[ci].bbminy = bbminy;
                        atom->iclusters[ci].bbmaxy = bbmaxy;
                        atom->iclusters[ci].bbminz = bbminz;
                        atom->iclusters[ci].bbmaxz = bbmaxz;
                        atom->Nclusters_local++;

                        // TODO: To create the bounding boxes faster, we can use SIMD operations
                        if(sc_bbminx > bbminx) { sc_bbminx = bbminx; }
                        if(sc_bbmaxx < bbmaxx) { sc_bbmaxx = bbmaxx; }
                        if(sc_bbminy > bbminy) { sc_bbminy = bbminy; }
                        if(sc_bbmaxy < bbmaxy) { sc_bbmaxy = bbmaxy; }
                        if(sc_bbminz > bbminz) { sc_bbminz = bbminz; }
                        if(sc_bbmaxz < bbmaxz) { sc_bbmaxz = bbmaxz; }

                        atom->siclusters[sci].nclusters++;
                        atom->icluster_idx[SCLUSTER_SIZE * sci + cluster_sup_idx] = ci;
                    }
                }
            }

            atom->sicluster_bin[sci] = bin;
            atom->siclusters[sci].bbminx = sc_bbminx;
            atom->siclusters[sci].bbmaxx = sc_bbmaxx;
            atom->siclusters[sci].bbminy = sc_bbminy;
            atom->siclusters[sci].bbmaxy = sc_bbmaxy;
            atom->siclusters[sci].bbminz = sc_bbminz;
            atom->siclusters[sci].bbmaxz = sc_bbmaxz;
            atom->Nsclusters_local++;
        }
    }

    DEBUG_MESSAGE("buildClustersGPU end\n");
}
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
 | 
			
		||||
void defineJClusters(Atom *atom) {
 | 
			
		||||
    DEBUG_MESSAGE("defineJClusters start\n");
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										180
									
								
								gromacs/pbc.c
									
									
									
									
									
								
							
							
						
						
									
										180
									
								
								gromacs/pbc.c
									
									
									
									
									
								
							@@ -86,6 +86,98 @@ void cpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
 | 
			
		||||
    DEBUG_MESSAGE("updatePbc end\n");
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/* update coordinates of ghost atoms */
 | 
			
		||||
/* uses mapping created in setupPbc */
 | 
			
		||||
/*
 * Update the coordinates of ghost clusters from the clusters they mirror.
 *
 * Each ghost cluster cg copies the positions of cluster atom->border_map[cg],
 * shifted by PBCx/PBCy/PBCz[cg] times the box length (param->xprd/yprd/zprd),
 * into both the cluster array (cl_x) and the super-cluster array (scl_x).
 * When firstUpdate is non-zero, the unused slots of the ghost cluster are set
 * to INFINITY and its bounding box in atom->jclusters is recomputed.
 */
void gpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
    DEBUG_MESSAGE("gpuUpdatePbc start\n");
    int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
    int ncj = atom->Nclusters_local / jfac;
    MD_FLOAT xprd = param->xprd;
    MD_FLOAT yprd = param->yprd;
    MD_FLOAT zprd = param->zprd;

    for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
        // Ghost clusters are stored right after the local j-clusters
        const int cj = ncj + cg;
        int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
        int scj_vec_base = SCJ_VECTOR_BASE_INDEX(cj);
        int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
        int sbmap_vec_base = SCJ_VECTOR_BASE_INDEX(atom->border_map[cg]);

        MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
        MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base];
        MD_FLOAT *scj_x = &atom->scl_x[scj_vec_base];
        MD_FLOAT *sbmap_x = &atom->scl_x[sbmap_vec_base];

        MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
        MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
        MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;

        for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
            MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
            MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
            MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;

            // NOTE(review): the super-cluster source is read with CL_*_OFFSET but the
            // destination is written with SCL_*_OFFSET - confirm both layouts agree
            MD_FLOAT sxtmp = sbmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
            MD_FLOAT sytmp = sbmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
            MD_FLOAT sztmp = sbmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;

            cj_x[CL_X_OFFSET + cjj] = xtmp;
            cj_x[CL_Y_OFFSET + cjj] = ytmp;
            cj_x[CL_Z_OFFSET + cjj] = ztmp;

            scj_x[SCL_X_OFFSET + cjj] = sxtmp;
            scj_x[SCL_Y_OFFSET + cjj] = sytmp;
            scj_x[SCL_Z_OFFSET + cjj] = sztmp;

            if(firstUpdate) {
                // TODO: To create the bounding boxes faster, we can use SIMD operations
                if(bbminx > xtmp) { bbminx = xtmp; }
                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
                if(bbminy > ytmp) { bbminy = ytmp; }
                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
                if(bbminz > ztmp) { bbminz = ztmp; }
                if(bbmaxz < ztmp) { bbmaxz = ztmp; }
            }
        }

        if(firstUpdate) {
            // Mark the unused slots of the ghost cluster
            for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
                cj_x[CL_X_OFFSET + cjj] = INFINITY;
                cj_x[CL_Y_OFFSET + cjj] = INFINITY;
                cj_x[CL_Z_OFFSET + cjj] = INFINITY;

                scj_x[SCL_X_OFFSET + cjj] = INFINITY;
                scj_x[SCL_Y_OFFSET + cjj] = INFINITY;
                scj_x[SCL_Z_OFFSET + cjj] = INFINITY;
            }

            atom->jclusters[cj].bbminx = bbminx;
            atom->jclusters[cj].bbmaxx = bbmaxx;
            atom->jclusters[cj].bbminy = bbminy;
            atom->jclusters[cj].bbmaxy = bbmaxy;
            atom->jclusters[cj].bbminz = bbminz;
            atom->jclusters[cj].bbmaxz = bbmaxz;
        }
    }

    DEBUG_MESSAGE("gpuUpdatePbc end\n");
}
 | 
			
		||||
 | 
			
		||||
/* relocate atoms that have left domain according
 | 
			
		||||
 * to periodic boundary conditions */
 | 
			
		||||
void updateAtomsPbc(Atom *atom, Parameter *param) {
 | 
			
		||||
@@ -229,3 +321,91 @@ void setupPbc(Atom *atom, Parameter *param) {
 | 
			
		||||
    cpuUpdatePbc(atom, param, 1);
 | 
			
		||||
    DEBUG_MESSAGE("setupPbc end\n");
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/*
 * Creates the periodic ghost j-clusters for the super-cluster (GPU) layout.
 *
 * Every non-empty local j-cluster whose bounding box lies within the neighbor
 * cutoff of a domain boundary is registered through ADDGHOST, once per
 * matching plane, corner and edge shift. Afterwards a dummy cluster filled
 * with INFINITY coordinates is appended, the ghost counters stored in `atom`
 * are updated and gpuUpdatePbc() is called for the newly created clusters.
 *
 * NOTE(review): ADDGHOST is a macro defined outside this function; it
 * presumably relies on the locals `atom`, `cj`, `ncj`, `Nghost` and
 * `Nghost_atoms`, so those names are kept untouched -- confirm against the
 * macro definition before renaming anything here.
 */
void setupPbcGPU(Atom *atom, Parameter *param) {
    DEBUG_MESSAGE("setupPbcGPU start\n");

    MD_FLOAT xprd = param->xprd;
    MD_FLOAT yprd = param->yprd;
    MD_FLOAT zprd = param->zprd;
    MD_FLOAT Cutneigh = param->cutneigh;
    /* number of j-clusters per super-cluster */
    int jfac = SCLUSTER_M / CLUSTER_M;
    int ncj = atom->Nsclusters_local * jfac;
    /* index of the last ghost cluster created so far, -1 while there is none */
    int Nghost = -1;
    int Nghost_atoms = 0;

    for(int cj = 0; cj < ncj; cj++) {
        /* empty clusters never produce ghosts */
        if(atom->jclusters[cj].natoms <= 0) {
            continue;
        }

        /* Grow the storage ahead of time; the +7 margin presumably covers the
         * ghosts a single cluster can add (3 planes, 3 edges, 1 corner)
         * -- TODO confirm */
        if(atom->Nsclusters_local + (Nghost + (jfac - 1) + 7) / jfac >= atom->Nclusters_max) {
            growClusters(atom);
        }

        if((Nghost + 7) * CLUSTER_M >= NmaxGhost) {
            growPbc(atom);
        }

        /* The bounding box is read only after the possible growth above */
        MD_FLOAT bbminx = atom->jclusters[cj].bbminx;
        MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx;
        MD_FLOAT bbminy = atom->jclusters[cj].bbminy;
        MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy;
        MD_FLOAT bbminz = atom->jclusters[cj].bbminz;
        MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz;

        /* Which boundaries is this cluster within cutoff distance of? */
        const int near_xlo = (bbminx < Cutneigh);
        const int near_xhi = (bbmaxx >= (xprd - Cutneigh));
        const int near_ylo = (bbminy < Cutneigh);
        const int near_yhi = (bbmaxy >= (yprd - Cutneigh));
        const int near_zlo = (bbminz < Cutneigh);
        const int near_zhi = (bbmaxz >= (zprd - Cutneigh));

        /* The order of the ADDGHOST calls below is kept exactly as before */

        /* 6 planes */
        if(near_xlo) { ADDGHOST(+1,0,0); }
        if(near_xhi) { ADDGHOST(-1,0,0); }
        if(near_ylo) { ADDGHOST(0,+1,0); }
        if(near_yhi) { ADDGHOST(0,-1,0); }
        if(near_zlo) { ADDGHOST(0,0,+1); }
        if(near_zhi) { ADDGHOST(0,0,-1); }

        /* 8 corners */
        if(near_xlo && near_ylo && near_zlo) { ADDGHOST(+1,+1,+1); }
        if(near_xlo && near_yhi && near_zlo) { ADDGHOST(+1,-1,+1); }
        if(near_xlo && near_ylo && near_zhi) { ADDGHOST(+1,+1,-1); }
        if(near_xlo && near_yhi && near_zhi) { ADDGHOST(+1,-1,-1); }
        if(near_xhi && near_ylo && near_zlo) { ADDGHOST(-1,+1,+1); }
        if(near_xhi && near_yhi && near_zlo) { ADDGHOST(-1,-1,+1); }
        if(near_xhi && near_ylo && near_zhi) { ADDGHOST(-1,+1,-1); }
        if(near_xhi && near_yhi && near_zhi) { ADDGHOST(-1,-1,-1); }

        /* 12 edges */
        if(near_xlo && near_zlo) { ADDGHOST(+1,0,+1); }
        if(near_xlo && near_zhi) { ADDGHOST(+1,0,-1); }
        if(near_xhi && near_zlo) { ADDGHOST(-1,0,+1); }
        if(near_xhi && near_zhi) { ADDGHOST(-1,0,-1); }
        if(near_ylo && near_zlo) { ADDGHOST(0,+1,+1); }
        if(near_ylo && near_zhi) { ADDGHOST(0,+1,-1); }
        if(near_yhi && near_zlo) { ADDGHOST(0,-1,+1); }
        if(near_yhi && near_zhi) { ADDGHOST(0,-1,-1); }
        if(near_ylo && near_xlo) { ADDGHOST(+1,+1,0); }
        if(near_ylo && near_xhi) { ADDGHOST(-1,+1,0); }
        if(near_yhi && near_xlo) { ADDGHOST(+1,-1,0); }
        if(near_yhi && near_xhi) { ADDGHOST(-1,-1,0); }
    }

    /* One more capacity check before the dummy cluster is appended */
    if(ncj + (Nghost + (jfac - 1) + 1) / jfac >= atom->Nclusters_max) {
        growClusters(atom);
    }

    /* Dummy cluster right behind the last ghost cluster, all coordinates INFINITY */
    int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1);
    MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
    for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
        cj_x[CL_X_OFFSET + cjj] = INFINITY;
        cj_x[CL_Y_OFFSET + cjj] = INFINITY;
        cj_x[CL_Z_OFFSET + cjj] = INFINITY;
    }

    /* Nghost is the index of the last ghost cluster, hence the +1 to turn it into a count */
    atom->dummy_cj = ncj + Nghost + 1;
    atom->Nghost = Nghost_atoms;
    atom->Nclusters_ghost = Nghost + 1;
    atom->Nclusters = atom->Nclusters_local + Nghost + 1;

    /* Update the positions of the ghost clusters that were just created */
    gpuUpdatePbc(atom, param, 1);
    DEBUG_MESSAGE("setupPbcGPU end\n");
}
 | 
			
		||||
 
 | 
			
		||||
@@ -13,7 +13,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
 | 
			
		||||
    MEM_TRACER_INIT;
 | 
			
		||||
    INDEX_TRACER_INIT;
 | 
			
		||||
    int Nlocal = atom->Nlocal;
 | 
			
		||||
    NeighborCluster* neighs;
 | 
			
		||||
    int* neighs;
 | 
			
		||||
    //MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
 | 
			
		||||
 | 
			
		||||
    INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
 | 
			
		||||
@@ -34,8 +34,7 @@ void traceAddresses(Parameter *param, Atom *atom, Neighbor *neighbor, int timest
 | 
			
		||||
        DIST_TRACE(neighs, numneighs);
 | 
			
		||||
 | 
			
		||||
        for(int k = 0; k < numneighs; k++) {
 | 
			
		||||
            int j = neighs[k].cj;
 | 
			
		||||
            MEM_TRACE(j, 'R');
 | 
			
		||||
            MEM_TRACE(neighs[k], 'R');
 | 
			
		||||
            MEM_TRACE(atom_x(j), 'R');
 | 
			
		||||
            MEM_TRACE(atom_y(j), 'R');
 | 
			
		||||
            MEM_TRACE(atom_z(j), 'R');
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										332
									
								
								gromacs/utils.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										332
									
								
								gromacs/utils.c
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,332 @@
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * Temporary functions for debugging; remove before proceeding with the pull request
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#include <stdio.h>
 | 
			
		||||
#include <stdlib.h>
 | 
			
		||||
#include <utils.h>
 | 
			
		||||
 | 
			
		||||
extern void alignDataToSuperclusters(Atom *atom);
 | 
			
		||||
extern void alignDataFromSuperclusters(Atom *atom);
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
/*
 | 
			
		||||
void verifyClusters(Atom *atom) {
 | 
			
		||||
    unsigned int count = 0;
 | 
			
		||||
 | 
			
		||||
    for (int i = 0; i < atom->Nsclusters_local; i++) {
 | 
			
		||||
        for (int j = 0; j < atom->siclusters[i].nclusters; j++) {
 | 
			
		||||
            for(int cii = 0; cii < CLUSTER_M; cii++, count++);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    MD_FLOAT *x = malloc(count * sizeof(MD_FLOAT));
 | 
			
		||||
    MD_FLOAT *y = malloc(count * sizeof(MD_FLOAT));
 | 
			
		||||
    MD_FLOAT *z = malloc(count * sizeof(MD_FLOAT));
 | 
			
		||||
 | 
			
		||||
    count = 0;
 | 
			
		||||
    unsigned int diffs = 0;
 | 
			
		||||
 | 
			
		||||
    printf("######### %d #########\r\n", atom->Nsclusters_local);
 | 
			
		||||
    for (int i = 0; i < atom->Nsclusters_local; i++) {
 | 
			
		||||
        printf("######### %d\t #########\r\n", atom->siclusters[i].nclusters);
 | 
			
		||||
 | 
			
		||||
        for (int j = 0; j < atom->siclusters[i].nclusters; j++) {
 | 
			
		||||
            //printf("%d\t", atom.siclusters[i].iclusters[j]);
 | 
			
		||||
            MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[i].iclusters[j])];
 | 
			
		||||
 | 
			
		||||
            if (atom->iclusters[atom->siclusters[i].iclusters[j]].bbminx < atom->siclusters[i].bbminx ||
 | 
			
		||||
            atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxx > atom->siclusters[i].bbmaxx ||
 | 
			
		||||
            atom->iclusters[atom->siclusters[i].iclusters[j]].bbminy < atom->siclusters[i].bbminy ||
 | 
			
		||||
            atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxy > atom->siclusters[i].bbmaxy ||
 | 
			
		||||
            atom->iclusters[atom->siclusters[i].iclusters[j]].bbminz < atom->siclusters[i].bbminz ||
 | 
			
		||||
            atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxz > atom->siclusters[i].bbmaxz) diffs++;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
            for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
 | 
			
		||||
                x[count] = ci_x[CL_X_OFFSET + cii];
 | 
			
		||||
                y[count] = ci_x[CL_Y_OFFSET + cii];
 | 
			
		||||
                z[count] = ci_x[CL_Z_OFFSET + cii];
 | 
			
		||||
                //printf("x: %f\ty: %f\tz: %f\r\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        printf("######### \t #########\r\n");
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    printf("######### Diffs: %d\t #########\r\n", diffs);
 | 
			
		||||
 | 
			
		||||
    printf("\r\n");
 | 
			
		||||
 | 
			
		||||
    count = 0;
 | 
			
		||||
    diffs = 0;
 | 
			
		||||
 | 
			
		||||
    for (int i = 0; i < atom->Nclusters_local; i++) {
 | 
			
		||||
        MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(i)];
 | 
			
		||||
 | 
			
		||||
        for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
 | 
			
		||||
            if (ci_x[CL_X_OFFSET + cii] != x[count] ||
 | 
			
		||||
                ci_x[CL_Y_OFFSET + cii] != y[count] ||
 | 
			
		||||
                ci_x[CL_Z_OFFSET + cii] != z[count]) diffs++;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    printf("######### Diffs: %d\t #########\r\n", diffs);
 | 
			
		||||
}
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
/*
 * Debug helper: prints the force entries of a fixed window of super-clusters
 * (indices 4 and 5, read from atom->scl_f) followed by those of a fixed
 * window of regular i-clusters (indices 35 and 36, read from atom->cl_f).
 *
 * The index windows are hard-coded and not checked against the number of
 * clusters actually present.
 * NOTE(review): the two windows presumably refer to the same atoms in both
 * layouts so the outputs can be compared by eye -- confirm.
 */
void verifyLayout(Atom *atom) {
    printf("verifyLayout start\r\n");

    for (int sci = 4; sci < 6; sci++) {
        MD_FLOAT *sci_f = &atom->scl_f[SCI_VECTOR_BASE_INDEX(sci)];

        for (int cii = 0; cii < SCLUSTER_M; ++cii) {
            /* sub-cluster within the super-cluster and atom slot within that sub-cluster */
            const unsigned int cl_idx = cii / CLUSTER_M;
            const unsigned int ciii = cii % CLUSTER_M;

            /* cl_idx is unsigned, so it is printed with %u */
            printf("%d\t%u\t%f\t%f\t%f\r\n", atom->icluster_idx[SCLUSTER_SIZE * sci + cl_idx], cl_idx, sci_f[SCL_CL_X_OFFSET(cl_idx) + ciii],
                   sci_f[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_f[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
        }

        printf("##########\t##########\r\n");
    }

    printf("\r\n");

    for (int ci = 35; ci < 37; ci++) {
        printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", ci, atom->icluster_bin[ci]);
        MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(ci)];

        for(int cii = 0; cii < CLUSTER_M; cii++) {
            printf("%f\t%f\t%f\r\n", ci_f[CL_X_OFFSET + cii],
                   ci_f[CL_Y_OFFSET + cii],
                   ci_f[CL_Z_OFFSET + cii]);
        }
        printf("##########\t##########\r\n");
    }

    printf("verifyLayout end\r\n");
}
 | 
			
		||||
 | 
			
		||||
/*
 * Debug helper: calls alignDataToSuperclusters() and then prints the
 * coordinates of super-clusters 4 and 5 (from atom->scl_x) followed by the
 * coordinates of i-clusters 35 and 36 (from atom->cl_x).
 *
 * The index windows are hard-coded and not checked against the number of
 * clusters actually present.
 */
void checkAlignment(Atom *atom) {
    alignDataToSuperclusters(atom);

    for (int sci = 4; sci < 6; sci++) {
        MD_FLOAT *sci_x = &atom->scl_x[SCI_VECTOR_BASE_INDEX(sci)];

        for (int cii = 0; cii < SCLUSTER_M; ++cii) {
            /* sub-cluster within the super-cluster and atom slot within that sub-cluster */
            const unsigned int cl_idx = cii / CLUSTER_M;
            const unsigned int ciii = cii % CLUSTER_M;

            /* cl_idx is unsigned, so it is printed with %u */
            printf("%u\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
                   sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
        }
    }

    for (int ci = 35; ci < 37; ci++) {
        printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", ci, atom->icluster_bin[ci]);
        MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];

        for(int cii = 0; cii < CLUSTER_M; cii++) {
            printf("%f\t%f\t%f\r\n", ci_x[CL_X_OFFSET + cii],
                   ci_x[CL_Y_OFFSET + cii],
                   ci_x[CL_Z_OFFSET + cii]);
        }
        printf("##########\t##########\r\n");
    }
}
 | 
			
		||||
 | 
			
		||||
/*
 * Debug helper: prints the coordinates stored in atom->scl_x for
 * super-clusters 4 and 5, one line per atom slot, prefixed with the index of
 * the sub-cluster the slot belongs to.
 *
 * The index window is hard-coded and not checked against the number of
 * super-clusters actually present.
 */
void showSuperclusters(Atom *atom) {
    for (int sci = 4; sci < 6; sci++) {
        MD_FLOAT *sci_x = &atom->scl_x[SCI_VECTOR_BASE_INDEX(sci)];

        for (int cii = 0; cii < SCLUSTER_M; ++cii) {
            /* sub-cluster within the super-cluster and atom slot within that sub-cluster */
            const unsigned int cl_idx = cii / CLUSTER_M;
            const unsigned int ciii = cii % CLUSTER_M;

            /* cl_idx is unsigned, so it is printed with %u */
            printf("%u\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
                   sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
        }
    }
}
 | 
			
		||||
 | 
			
		||||
/*
 * Debug helper: writes the neighbor list of every local cluster to stdout,
 * one line per cluster with the entries separated by spaces.
 */
void printNeighs(Atom *atom, Neighbor *neighbor) {
    int ci = 0;

    while (ci < atom->Nclusters_local) {
        const int count = neighbor->numneigh[ci];
        int k = 0;

        /* row `ci` of the flat list starts at ci * maxneighs */
        while (k < count) {
            printf("%d ", neighbor->neighbors[ci * neighbor->maxneighs + k]);
            k++;
        }

        printf("\r\n");
        ci++;
    }
}
 | 
			
		||||
 | 
			
		||||
/*
 * Debug helper: for every local super-cluster, prints the entries of
 * atom->icluster_idx that belong to it on one line, separated by spaces.
 */
void printClusterIndices(Atom *atom) {
    int sci = 0;

    while (sci < atom->Nsclusters_local) {
        const int members = atom->siclusters[sci].nclusters;
        int slot = 0;

        /* each super-cluster owns SCLUSTER_SIZE consecutive index slots */
        while (slot < members) {
            printf("%d ", atom->icluster_idx[slot + SCLUSTER_SIZE * sci]);
            slot++;
        }

        printf("\r\n");
        sci++;
    }
}
 | 
			
		||||
 | 
			
		||||
/*
 * Debug helper: compares the neighbor lists produced by buildNeighbor() with
 * those produced by buildNeighborGPU().
 *
 * The result of buildNeighbor() is copied into temporary buffers and zeroed
 * in `neighbor`, then buildNeighborGPU() is run and both results are
 * compared. Prints "<num_diff>\t<neigh_diff>", where num_diff is the number
 * of clusters whose neighbor count differs and neigh_diff is the number of
 * differing list entries.
 *
 * Changes with respect to the previous version:
 *  - the reference list is sized with the element size (it used sizeof(int*)),
 *  - allocation failures are reported instead of dereferencing NULL,
 *  - the temporary buffers are released before returning,
 *  - entries beyond the saved reference count are counted as differences
 *    instead of being read from uninitialised memory,
 *  - the saved copy is always indexed with the row stride it was stored with,
 *    even if neighbor->maxneighs changes during buildNeighborGPU(),
 *  - the unsigned counters are printed with %u.
 */
void verifyNeigh(Atom *atom, Neighbor *neighbor) {

    buildNeighbor(atom, neighbor);

    /* Snapshot the dimensions that the reference copy is stored with */
    const int nclusters = atom->Nclusters_local;
    const size_t nrows = (nclusters > 0) ? (size_t) nclusters : 0;
    const size_t ref_stride = (neighbor->maxneighs > 0) ? (size_t) neighbor->maxneighs : 0;

    int *numneigh = (int*) malloc(nrows * sizeof *numneigh);
    int *neighbors = (int*) malloc(nrows * ref_stride * sizeof *neighbors);

    /* malloc(0) may legitimately return NULL, so only non-empty requests count as failures */
    if ((numneigh == NULL && nrows > 0) || (neighbors == NULL && nrows * ref_stride > 0)) {
        fprintf(stderr, "verifyNeigh: could not allocate reference neighbor lists!\n");
        free(neighbors);
        free(numneigh);
        return;
    }

    /* Save the reference result and clear it in `neighbor` */
    for (int i = 0; i < nclusters; ++i) {
        const int neigh_num = neighbor->numneigh[i];
        numneigh[i] = neigh_num;
        neighbor->numneigh[i] = 0;
        for (int j = 0; j < neigh_num; j++) {
            const size_t idx = (size_t) i * ref_stride + (size_t) j;
            neighbors[idx] = neighbor->neighbors[idx];
            neighbor->neighbors[idx] = 0;
        }
    }

    buildNeighborGPU(atom, neighbor);

    unsigned int num_diff = 0;
    unsigned int neigh_diff = 0;

    for (int i = 0; i < nclusters; ++i) {
        const int neigh_num = neighbor->numneigh[i];
        if (numneigh[i] != neigh_num) { num_diff++; }
        for (int j = 0; j < neigh_num; j++) {
            /* no reference entry exists at or beyond numneigh[i]: count it as a difference */
            if (j >= numneigh[i] ||
                neighbors[(size_t) i * ref_stride + (size_t) j] !=
                neighbor->neighbors[i * neighbor->maxneighs + j]) {
                neigh_diff++;
            }
        }
    }

    printf("%u\t%u\r\n", num_diff, neigh_diff);

    free(neighbors);
    free(numneigh);
}
 | 
			
		||||
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
@@ -15,8 +15,61 @@ void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) {
 | 
			
		||||
    write_ghost_atoms_to_vtk_file(filename, atom, timestep);
 | 
			
		||||
    write_local_cluster_edges_to_vtk_file(filename, atom, timestep);
 | 
			
		||||
    write_ghost_cluster_edges_to_vtk_file(filename, atom, timestep);
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
    write_super_clusters_to_vtk_file(filename, atom, timestep);
 | 
			
		||||
#endif //#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#ifdef USE_SUPER_CLUSTERS
 | 
			
		||||
/*
 * Writes the atom slots of all local super-clusters to
 * "<filename>_sup_<timestep>.vtk" as an ASCII legacy VTK unstructured grid
 * with one vertex cell per point.
 *
 * Returns 0 on success and -1 if the file cannot be opened.
 *
 * The POINTS, CELLS, CELL_TYPES and POINT_DATA sections all use the same
 * point count (Nsclusters_local * SCLUSTER_M). Previously the last three
 * sections used atom->Nlocal, which does not match the number of points
 * actually written.
 *
 * NOTE(review): every super-cluster is scaled by a random integer factor in
 * [1, 1000]; this presumably exists to pull the super-clusters apart in the
 * visualisation -- confirm that this is still wanted.
 */
int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep) {
    char timestep_filename[128];
    snprintf(timestep_filename, sizeof timestep_filename, "%s_sup_%d.vtk", filename, timestep);
    FILE* fp = fopen(timestep_filename, "wb");

    if(fp == NULL) {
        fprintf(stderr, "Could not open VTK file for writing!\n");
        return -1;
    }

    /* every slot of every super-cluster is emitted as one point */
    const int npoints = atom->Nsclusters_local * SCLUSTER_M;

    fprintf(fp, "# vtk DataFile Version 2.0\n");
    fprintf(fp, "Particle data\n");
    fprintf(fp, "ASCII\n");
    fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
    fprintf(fp, "POINTS %d double\n", npoints);
    for(int ci = 0; ci < atom->Nsclusters_local; ++ci) {

        int factor = (rand() % 1000) + 1;

        int ci_vec_base = SCI_VECTOR_BASE_INDEX(ci);
        MD_FLOAT *ci_x = &atom->scl_x[ci_vec_base];
        for(int cii = 0; cii < SCLUSTER_M; ++cii) {
            fprintf(fp, "%.4f %.4f %.4f\n", ci_x[SCL_X_OFFSET + cii] * factor, ci_x[SCL_Y_OFFSET + cii] * factor, ci_x[SCL_Z_OFFSET + cii] * factor);
        }
    }
    fprintf(fp, "\n\n");
    /* one single-point cell per point: each cell line holds two integers */
    fprintf(fp, "CELLS %d %d\n", npoints, npoints * 2);
    for(int i = 0; i < npoints; ++i) {
        fprintf(fp, "1 %d\n", i);
    }
    fprintf(fp, "\n\n");
    fprintf(fp, "CELL_TYPES %d\n", npoints);
    for(int i = 0; i < npoints; ++i) {
        /* cell type 1 is VTK_VERTEX */
        fprintf(fp, "1\n");
    }
    fprintf(fp, "\n\n");
    fprintf(fp, "POINT_DATA %d\n", npoints);
    fprintf(fp, "SCALARS mass double\n");
    fprintf(fp, "LOOKUP_TABLE default\n");
    for(int i = 0; i < npoints; i++) {
        fprintf(fp, "1.0\n");
    }
    fprintf(fp, "\n\n");
    fclose(fp);
    return 0;
}
 | 
			
		||||
#endif //USE_SUPER_CLUSTERS
 | 
			
		||||
 | 
			
		||||
int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
 | 
			
		||||
    char timestep_filename[128];
 | 
			
		||||
    snprintf(timestep_filename, sizeof timestep_filename, "%s_local_%d.vtk", filename, timestep);
 | 
			
		||||
 
 | 
			
		||||
@@ -7,7 +7,6 @@ ANSI_CFLAGS += -pedantic
 | 
			
		||||
ANSI_CFLAGS += -Wextra
 | 
			
		||||
 | 
			
		||||
CFLAGS   = -Ofast -march=native $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
 | 
			
		||||
#CFLAGS   = -Ofast -march=core-avx2 $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
 | 
			
		||||
#CFLAGS   = -O3 -march=cascadelake $(ANSI_CFLAGS) #-Xpreprocessor -fopenmp -g
 | 
			
		||||
#CFLAGS   = -Ofast $(ANSI_CFLAGS) -g #-Xpreprocessor -fopenmp -g
 | 
			
		||||
ASFLAGS  = -masm=intel
 | 
			
		||||
 
 | 
			
		||||
@@ -6,29 +6,13 @@ ANSI_CFLAGS += -std=c99
 | 
			
		||||
ANSI_CFLAGS += -pedantic
 | 
			
		||||
ANSI_CFLAGS += -Wextra
 | 
			
		||||
 | 
			
		||||
ifeq ($(ISA),AVX512)
 | 
			
		||||
CFLAGS   = -Ofast -mavx512f -mavx512vl -mavx512bw -mavx512dq -mavx512cd -ffast-math -funroll-loops # -fopenmp
 | 
			
		||||
#CFLAGS   = -O0 -g  -std=c99 -fargument-noalias
 | 
			
		||||
#CFLAGS   = -O3 -march=cascadelake  -ffast-math -funroll-loops # -fopenmp
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(ISA),AVX2)
 | 
			
		||||
CFLAGS   = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
 | 
			
		||||
#CFLAGS   = -Ofast -march=native -mavx2  -ffast-math -funroll-loops # -fopenmp
 | 
			
		||||
#CFLAGS   = -O3 -march=znver1  -ffast-math -funroll-loops # -fopenmp
 | 
			
		||||
#CFLAGS   = -Ofast -mavx2 -ffast-math -funroll-loops # -fopenmp
 | 
			
		||||
CFLAGS   = -Ofast -mavx2 -mfma -ffast-math -funroll-loops # -fopenmp
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(ISA),AVX)
 | 
			
		||||
CFLAGS   = -Ofast -mavx -ffast-math -funroll-loops # -fopenmp
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(ISA),SSE)
 | 
			
		||||
CFLAGS   = -Ofast -msse4.2 -ffast-math -funroll-loops # -fopenmp
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
#CFLAGS   = -O0 -g -std=c99 -fargument-noalias
 | 
			
		||||
#CFLAGS   = -Ofast -march=native -ffast-math -funroll-loops # -fopenmp
 | 
			
		||||
#CFLAGS   = -Ofast -march=native  -ffast-math -funroll-loops # -fopenmp
 | 
			
		||||
#CFLAGS   = -O3 -march=native  -ffast-math -funroll-loops # -fopenmp
 | 
			
		||||
#CFLAGS   = -O3 -march=znver1  -ffast-math -funroll-loops # -fopenmp
 | 
			
		||||
ASFLAGS  =  #-masm=intel
 | 
			
		||||
LFLAGS   =
 | 
			
		||||
DEFINES  = -D_GNU_SOURCE -DNO_ZMM_INTRIN
 | 
			
		||||
 
 | 
			
		||||
@@ -3,25 +3,11 @@ LINKER = $(CC)
 | 
			
		||||
 | 
			
		||||
OPENMP  = #-qopenmp
 | 
			
		||||
PROFILE  = #-profile-functions -g  -pg
 | 
			
		||||
 | 
			
		||||
ifeq ($(ISA),AVX512)
 | 
			
		||||
OPTS      = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(ISA),AVX2)
 | 
			
		||||
OPTS     = -Ofast -xCORE-AVX2  $(PROFILE)
 | 
			
		||||
#OPTS     = -Ofast -xCORE-AVX2  $(PROFILE)
 | 
			
		||||
#OPTS     = -Ofast -xAVX  $(PROFILE)
 | 
			
		||||
#OPTS     = -Ofast -xAVX2  $(PROFILE)
 | 
			
		||||
#OPTS     = -Ofast -march=core-avx2 $(PROFILE)
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(ISA),AVX)
 | 
			
		||||
OPTS     = -Ofast -xAVX  $(PROFILE)
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(ISA),SSE)
 | 
			
		||||
OPTS     = -Ofast -xSSE4.2 $(PROFILE)
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
#OPTS     = -Ofast -xSSE4.2 $(PROFILE)
 | 
			
		||||
#OPTS     = -Ofast -no-vec $(PROFILE)
 | 
			
		||||
#OPTS     = -Ofast -xHost $(PROFILE)
 | 
			
		||||
CFLAGS   = $(PROFILE) -restrict $(OPENMP) $(OPTS)
 | 
			
		||||
 
 | 
			
		||||
@@ -3,28 +3,13 @@ LINKER = $(CC)
 | 
			
		||||
 | 
			
		||||
OPENMP  = #-qopenmp
 | 
			
		||||
PROFILE  = #-profile-functions -g  -pg
 | 
			
		||||
 | 
			
		||||
ifeq ($(ISA),AVX512)
 | 
			
		||||
OPTS      = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
 | 
			
		||||
#OPTS      = -Ofast -march=cascadelake -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(ISA),AVX2)
 | 
			
		||||
OPTS     = -Ofast -xCORE-AVX2  $(PROFILE)
 | 
			
		||||
#OPTS     = -Ofast -xHost  $(PROFILE)
 | 
			
		||||
#OPTS     = -Ofast -march=core-avx2 $(PROFILE)
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(ISA),AVX)
 | 
			
		||||
OPTS     = -Ofast -xAVX  $(PROFILE)
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
ifeq ($(ISA),SSE)
 | 
			
		||||
OPTS     = -Ofast -xSSE4.2 $(PROFILE)
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
#OPTS      = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE)
 | 
			
		||||
#OPTS     = -Ofast -xCORE-AVX2  $(PROFILE)
 | 
			
		||||
#OPTS     = -Ofast -xAVX  $(PROFILE)
 | 
			
		||||
#OPTS     = -Ofast -xAVX2  $(PROFILE)
 | 
			
		||||
#OPTS     = -Ofast -xSSE4.2 $(PROFILE)
 | 
			
		||||
#OPTS     = -Ofast -no-vec $(PROFILE)
 | 
			
		||||
#OPTS     = -Ofast -xHost $(PROFILE)
 | 
			
		||||
OPTS     = -Ofast -xHost $(PROFILE)
 | 
			
		||||
CFLAGS   = $(PROFILE) $(OPENMP) $(OPTS)
 | 
			
		||||
ASFLAGS  = #-masm=intel
 | 
			
		||||
LFLAGS   = $(PROFILE) $(OPTS) $(OPENMP)
 | 
			
		||||
 
 | 
			
		||||
@@ -9,15 +9,13 @@ else ifeq ($(strip $(ISA)), AVX_FMA)
 | 
			
		||||
    __ISA_AVX_FMA__=true
 | 
			
		||||
    __SIMD_WIDTH_DBL__=4
 | 
			
		||||
else ifeq ($(strip $(ISA)), AVX2)
 | 
			
		||||
    #__SIMD_KERNEL__=true
 | 
			
		||||
    __ISA_AVX2__=true
 | 
			
		||||
    #__SIMD_KERNEL__=true
 | 
			
		||||
    __SIMD_WIDTH_DBL__=4
 | 
			
		||||
else ifeq ($(strip $(ISA)), AVX512)
 | 
			
		||||
    __ISA_AVX512__=true
 | 
			
		||||
    __SIMD_KERNEL__=true
 | 
			
		||||
    __SIMD_WIDTH_DBL__=8
 | 
			
		||||
    ifeq ($(strip $(DATA_TYPE)), DP)
 | 
			
		||||
        __SIMD_KERNEL__=true
 | 
			
		||||
    endif
 | 
			
		||||
endif
 | 
			
		||||
 | 
			
		||||
# SIMD width is specified in double-precision, hence it may
 | 
			
		||||
 
 | 
			
		||||
@@ -8,7 +8,8 @@ ANSI_CFLAGS += -Wextra
 | 
			
		||||
 | 
			
		||||
#
 | 
			
		||||
# A100 + Native
 | 
			
		||||
CFLAGS   = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
 | 
			
		||||
#CFLAGS   = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
 | 
			
		||||
CFLAGS   = -O3 -arch=compute_61 -code=sm_61,sm_80,sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
 | 
			
		||||
# A40 + Native
 | 
			
		||||
#CFLAGS   = -O3 -arch=sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
 | 
			
		||||
# Cascade Lake
 | 
			
		||||
 
 | 
			
		||||
@@ -31,12 +31,8 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
 | 
			
		||||
    int nrho = eam->nrho; int nrho_tot = eam->nrho_tot;
 | 
			
		||||
    double S = getTimeStamp();
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    #pragma omp parallel
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("force_eam_fp");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp parallel for
 | 
			
		||||
    for(int i = 0; i < Nlocal; i++) {
 | 
			
		||||
        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
 | 
			
		||||
        int numneighs = neighbor->numneigh[i];
 | 
			
		||||
@@ -99,19 +95,13 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_STOP("force_eam_fp");
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // We still need to update fp for PBC atoms
 | 
			
		||||
    for(int i = 0; i < atom->Nghost; i++) {
 | 
			
		||||
        fp[Nlocal + i] = fp[atom->border_map[i]];
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    #pragma omp parallel
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("force_eam");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    for(int i = 0; i < Nlocal; i++) {
 | 
			
		||||
        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
 | 
			
		||||
        int numneighs = neighbor->numneigh[i];
 | 
			
		||||
@@ -202,8 +192,6 @@ double computeForceEam(Eam* eam, Parameter* param, Atom *atom, Neighbor *neighbo
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_STOP("force_eam");
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    double E = getTimeStamp();
 | 
			
		||||
    return E-S;
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -26,22 +26,17 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
 | 
			
		||||
    MD_FLOAT sigma6 = param->sigma6;
 | 
			
		||||
    MD_FLOAT epsilon = param->epsilon;
 | 
			
		||||
    #endif
 | 
			
		||||
    const MD_FLOAT num1 = 1.0;
 | 
			
		||||
    const MD_FLOAT num48 = 48.0;
 | 
			
		||||
    const MD_FLOAT num05 = 0.5;
 | 
			
		||||
 | 
			
		||||
    for(int i = 0; i < Nlocal; i++) {
 | 
			
		||||
        atom_fx(i) = 0.0;
 | 
			
		||||
        atom_fy(i) = 0.0;
 | 
			
		||||
        atom_fz(i) = 0.0;
 | 
			
		||||
    }
 | 
			
		||||
    double S = getTimeStamp();
 | 
			
		||||
 | 
			
		||||
    #pragma omp parallel
 | 
			
		||||
    {
 | 
			
		||||
    double S = getTimeStamp();
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp parallel for
 | 
			
		||||
    for(int i = 0; i < Nlocal; i++) {
 | 
			
		||||
        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
 | 
			
		||||
        int numneighs = neighbor->numneigh[i];
 | 
			
		||||
@@ -72,9 +67,9 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
 | 
			
		||||
            #endif
 | 
			
		||||
 | 
			
		||||
            if(rsq < cutforcesq) {
 | 
			
		||||
                MD_FLOAT sr2 = num1 / rsq;
 | 
			
		||||
                MD_FLOAT sr2 = 1.0 / rsq;
 | 
			
		||||
                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
 | 
			
		||||
                MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
 | 
			
		||||
                MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
 | 
			
		||||
                fix += delx * force;
 | 
			
		||||
                fiy += dely * force;
 | 
			
		||||
                fiz += delz * force;
 | 
			
		||||
@@ -95,8 +90,6 @@ double computeForceLJFullNeigh_plain_c(Parameter *param, Atom *atom, Neighbor *n
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_STOP("force");
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    double E = getTimeStamp();
 | 
			
		||||
    return E-S;
 | 
			
		||||
}
 | 
			
		||||
@@ -109,9 +102,6 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
    MD_FLOAT sigma6 = param->sigma6;
 | 
			
		||||
    MD_FLOAT epsilon = param->epsilon;
 | 
			
		||||
    #endif
 | 
			
		||||
    const MD_FLOAT num1 = 1.0;
 | 
			
		||||
    const MD_FLOAT num48 = 48.0;
 | 
			
		||||
    const MD_FLOAT num05 = 0.5;
 | 
			
		||||
 | 
			
		||||
    for(int i = 0; i < Nlocal; i++) {
 | 
			
		||||
        atom_fx(i) = 0.0;
 | 
			
		||||
@@ -120,12 +110,8 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    double S = getTimeStamp();
 | 
			
		||||
 | 
			
		||||
    #pragma omp parallel
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("forceLJ-halfneigh");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    for(int i = 0; i < Nlocal; i++) {
 | 
			
		||||
        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
 | 
			
		||||
        int numneighs = neighbor->numneigh[i];
 | 
			
		||||
@@ -160,9 +146,9 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
            #endif
 | 
			
		||||
 | 
			
		||||
            if(rsq < cutforcesq) {
 | 
			
		||||
                MD_FLOAT sr2 = num1 / rsq;
 | 
			
		||||
                MD_FLOAT sr2 = 1.0 / rsq;
 | 
			
		||||
                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
 | 
			
		||||
                MD_FLOAT force = num48 * sr6 * (sr6 - num05) * sr2 * epsilon;
 | 
			
		||||
                MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
 | 
			
		||||
                fix += delx * force;
 | 
			
		||||
                fiy += dely * force;
 | 
			
		||||
                fiz += delz * force;
 | 
			
		||||
@@ -185,8 +171,6 @@ double computeForceLJHalfNeigh(Parameter *param, Atom *atom, Neighbor *neighbor,
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_STOP("forceLJ-halfneigh");
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    double E = getTimeStamp();
 | 
			
		||||
    return E-S;
 | 
			
		||||
}
 | 
			
		||||
@@ -205,6 +189,7 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    double S = getTimeStamp();
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    #ifndef __SIMD_KERNEL__
 | 
			
		||||
    fprintf(stderr, "Error: SIMD kernel not implemented for specified instruction set!");
 | 
			
		||||
@@ -216,12 +201,7 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
 | 
			
		||||
    MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
 | 
			
		||||
    MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
    #pragma omp parallel
 | 
			
		||||
    {
 | 
			
		||||
    LIKWID_MARKER_START("force");
 | 
			
		||||
 | 
			
		||||
    #pragma omp for
 | 
			
		||||
    #pragma omp parallel for
 | 
			
		||||
    for(int i = 0; i < Nlocal; i++) {
 | 
			
		||||
        neighs = &neighbor->neighbors[i * neighbor->maxneighs];
 | 
			
		||||
        int numneighs = neighbor->numneigh[i];
 | 
			
		||||
@@ -262,11 +242,9 @@ double computeForceLJFullNeigh_simd(Parameter *param, Atom *atom, Neighbor *neig
 | 
			
		||||
        atom_fy(i) += simd_h_reduce_sum(fiy);
 | 
			
		||||
        atom_fz(i) += simd_h_reduce_sum(fiz);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_STOP("force");
 | 
			
		||||
    }
 | 
			
		||||
    #endif
 | 
			
		||||
 | 
			
		||||
    LIKWID_MARKER_STOP("force");
 | 
			
		||||
    double E = getTimeStamp();
 | 
			
		||||
    return E-S;
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
@@ -1,88 +0,0 @@
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
CPU name:	Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
 | 
			
		||||
CPU type:	Intel Cascadelake SP processor
 | 
			
		||||
CPU clock:	2.49 GHz
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Initializing parameters...
 | 
			
		||||
Initializing atoms...
 | 
			
		||||
Creating atoms...
 | 
			
		||||
Pattern: seq
 | 
			
		||||
Number of timesteps: 200
 | 
			
		||||
Number of atoms: 256
 | 
			
		||||
Number of neighbors per atom: 1024
 | 
			
		||||
Number of times to replicate neighbor lists: 1
 | 
			
		||||
Estimated total data volume (kB): 1062.9120
 | 
			
		||||
Estimated atom data volume (kB): 6.1440
 | 
			
		||||
Estimated neighborlist data volume (kB): 1050.6240
 | 
			
		||||
Initializing neighbor lists...
 | 
			
		||||
Creating neighbor lists...
 | 
			
		||||
Computing forces...
 | 
			
		||||
Total time: 0.2735, Mega atom updates/s: 0.1872
 | 
			
		||||
Cycles per atom: 10682.8568, Cycles per neighbor: 10.4325
 | 
			
		||||
Statistics:
 | 
			
		||||
	Vector width: 8, Processor frequency: 2.0000 GHz
 | 
			
		||||
	Average neighbors per atom: 1018.9055
 | 
			
		||||
	Average SIMD iterations per atom: 127.3632
 | 
			
		||||
	Total number of computed pair interactions: 52428800
 | 
			
		||||
	Total number of SIMD iterations: 6553600
 | 
			
		||||
	Useful read data volume for force computation: 1.47GB
 | 
			
		||||
	Cycles/SIMD iteration: 83.4598
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Region force, Group 1: MEM_DP
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
|    Region Info    | HWThread 0 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
| RDTSC Runtime [s] |   0.110776 |
 | 
			
		||||
|     call count    |        200 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
 | 
			
		||||
+------------------------------------------+---------+------------+
 | 
			
		||||
|                   Event                  | Counter | HWThread 0 |
 | 
			
		||||
+------------------------------------------+---------+------------+
 | 
			
		||||
|             INSTR_RETIRED_ANY            |  FIXC0  |  267036300 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_CORE          |  FIXC1  |  219034500 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_REF           |  FIXC2  |  273793400 |
 | 
			
		||||
|              PWR_PKG_ENERGY              |   PWR0  |    10.9296 |
 | 
			
		||||
|              PWR_DRAM_ENERGY             |   PWR3  |          0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE |   PMC0  |          0 |
 | 
			
		||||
|    FP_ARITH_INST_RETIRED_SCALAR_DOUBLE   |   PMC1  |     159400 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE |   PMC2  |          0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE |   PMC3  |  197068800 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX0C0 |       8643 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX0C1 |       1367 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX1C0 |       9124 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX1C1 |       1354 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX2C0 |       9138 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX2C1 |       1356 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX3C0 |       5586 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX3C1 |       1297 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX4C0 |       5328 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX4C1 |       1269 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX5C0 |       5280 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX5C1 |       1295 |
 | 
			
		||||
+------------------------------------------+---------+------------+
 | 
			
		||||
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|               Metric              | HWThread 0 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|        Runtime (RDTSC) [s]        |     0.1108 |
 | 
			
		||||
|        Runtime unhalted [s]       |     0.0878 |
 | 
			
		||||
|            Clock [MHz]            |  1995.2564 |
 | 
			
		||||
|                CPI                |     0.8202 |
 | 
			
		||||
|             Energy [J]            |    10.9296 |
 | 
			
		||||
|             Power [W]             |    98.6643 |
 | 
			
		||||
|          Energy DRAM [J]          |          0 |
 | 
			
		||||
|           Power DRAM [W]          |          0 |
 | 
			
		||||
|            DP [MFLOP/s]           | 14233.3287 |
 | 
			
		||||
|          AVX DP [MFLOP/s]         | 14231.8898 |
 | 
			
		||||
|          Packed [MUOPS/s]         |  1778.9862 |
 | 
			
		||||
|          Scalar [MUOPS/s]         |     1.4389 |
 | 
			
		||||
|  Memory read bandwidth [MBytes/s] |    24.9001 |
 | 
			
		||||
|  Memory read data volume [GBytes] |     0.0028 |
 | 
			
		||||
| Memory write bandwidth [MBytes/s] |     4.5861 |
 | 
			
		||||
| Memory write data volume [GBytes] |     0.0005 |
 | 
			
		||||
|    Memory bandwidth [MBytes/s]    |    29.4863 |
 | 
			
		||||
|    Memory data volume [GBytes]    |     0.0033 |
 | 
			
		||||
|       Operational intensity       |   482.7104 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
 | 
			
		||||
@@ -1,168 +0,0 @@
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
CPU name:	Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
 | 
			
		||||
CPU type:	Intel Cascadelake SP processor
 | 
			
		||||
CPU clock:	2.49 GHz
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Parameters:
 | 
			
		||||
	Force field: lj
 | 
			
		||||
	Kernel: plain-C
 | 
			
		||||
	Data layout: AoS
 | 
			
		||||
	Floating-point precision: double
 | 
			
		||||
	Unit cells (nx, ny, nz): 32, 32, 32
 | 
			
		||||
	Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
 | 
			
		||||
	Periodic (x, y, z): 1, 1, 1
 | 
			
		||||
	Lattice size: 1.679596e+00
 | 
			
		||||
	Epsilon: 1.000000e+00
 | 
			
		||||
	Sigma: 1.000000e+00
 | 
			
		||||
	Spring constant: 1.000000e+00
 | 
			
		||||
	Damping constant: 1.000000e+00
 | 
			
		||||
	Temperature: 1.440000e+00
 | 
			
		||||
	RHO: 8.442000e-01
 | 
			
		||||
	Mass: 1.000000e+00
 | 
			
		||||
	Number of types: 4
 | 
			
		||||
	Number of timesteps: 200
 | 
			
		||||
	Report stats every (timesteps): 100
 | 
			
		||||
	Reneighbor every (timesteps): 20
 | 
			
		||||
	Prune every (timesteps): 1000
 | 
			
		||||
	Output positions every (timesteps): 20
 | 
			
		||||
	Output velocities every (timesteps): 5
 | 
			
		||||
	Delta time (dt): 5.000000e-03
 | 
			
		||||
	Cutoff radius: 2.500000e+00
 | 
			
		||||
	Skin: 3.000000e-01
 | 
			
		||||
	Half neighbor lists: 0
 | 
			
		||||
	Processor frequency (GHz): 2.0000
 | 
			
		||||
----------------------------------------------------------------------------
 | 
			
		||||
step	temp		pressure
 | 
			
		||||
0	1.440000e+00	1.215639e+00
 | 
			
		||||
100	8.200895e-01	6.923143e-01
 | 
			
		||||
200	7.961495e-01	6.721043e-01
 | 
			
		||||
----------------------------------------------------------------------------
 | 
			
		||||
System: 131072 atoms 47265 ghost atoms, Steps: 200
 | 
			
		||||
TOTAL 11.50s FORCE 5.28s NEIGH 5.91s REST 0.31s
 | 
			
		||||
----------------------------------------------------------------------------
 | 
			
		||||
Performance: 2.28 million atom updates per second
 | 
			
		||||
Statistics:
 | 
			
		||||
	Vector width: 8, Processor frequency: 2.0000 GHz
 | 
			
		||||
	Average neighbors per atom: 76.0352
 | 
			
		||||
	Average SIMD iterations per atom: 9.9181
 | 
			
		||||
	Total number of computed pair interactions: 2003182862
 | 
			
		||||
	Total number of SIMD iterations: 261297661
 | 
			
		||||
	Useful read data volume for force computation: 57.46GB
 | 
			
		||||
	Cycles/SIMD iteration: 40.4432
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Region force, Group 1: MEM_DP
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
|    Region Info    | HWThread 0 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
| RDTSC Runtime [s] |   5.115807 |
 | 
			
		||||
|     call count    |        201 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|                   Event                  | Counter |  HWThread 0 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|             INSTR_RETIRED_ANY            |  FIXC0  | 12592470000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_CORE          |  FIXC1  | 10196910000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_REF           |  FIXC2  | 12746120000 |
 | 
			
		||||
|              PWR_PKG_ENERGY              |   PWR0  |    307.9429 |
 | 
			
		||||
|              PWR_DRAM_ENERGY             |   PWR3  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE |   PMC0  |           0 |
 | 
			
		||||
|    FP_ARITH_INST_RETIRED_SCALAR_DOUBLE   |   PMC1  |    79042240 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE |   PMC2  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE |   PMC3  |  8076039000 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX0C0 |    22734550 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX0C1 |     1147714 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX1C0 |    22755180 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX1C1 |     1144415 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX2C0 |    22762780 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX2C1 |     1129051 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX3C0 |    22905660 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX3C1 |     1143324 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX4C0 |    22914860 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX4C1 |     1169116 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX5C0 |    22890220 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX5C1 |     1180739 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|               Metric              | HWThread 0 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|        Runtime (RDTSC) [s]        |     5.1158 |
 | 
			
		||||
|        Runtime unhalted [s]       |     4.0885 |
 | 
			
		||||
|            Clock [MHz]            |  1995.2508 |
 | 
			
		||||
|                CPI                |     0.8098 |
 | 
			
		||||
|             Energy [J]            |   307.9429 |
 | 
			
		||||
|             Power [W]             |    60.1944 |
 | 
			
		||||
|          Energy DRAM [J]          |          0 |
 | 
			
		||||
|           Power DRAM [W]          |          0 |
 | 
			
		||||
|            DP [MFLOP/s]           | 12644.6041 |
 | 
			
		||||
|          AVX DP [MFLOP/s]         | 12629.1535 |
 | 
			
		||||
|          Packed [MUOPS/s]         |  1578.6442 |
 | 
			
		||||
|          Scalar [MUOPS/s]         |    15.4506 |
 | 
			
		||||
|  Memory read bandwidth [MBytes/s] |  1713.4438 |
 | 
			
		||||
|  Memory read data volume [GBytes] |     8.7656 |
 | 
			
		||||
| Memory write bandwidth [MBytes/s] |    86.5003 |
 | 
			
		||||
| Memory write data volume [GBytes] |     0.4425 |
 | 
			
		||||
|    Memory bandwidth [MBytes/s]    |  1799.9442 |
 | 
			
		||||
|    Memory data volume [GBytes]    |     9.2082 |
 | 
			
		||||
|       Operational intensity       |     7.0250 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
 | 
			
		||||
Region reneighbour, Group 1: MEM_DP
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
|    Region Info    | HWThread 0 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
| RDTSC Runtime [s] |   5.897385 |
 | 
			
		||||
|     call count    |         10 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|                   Event                  | Counter |  HWThread 0 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|             INSTR_RETIRED_ANY            |  FIXC0  | 18212540000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_CORE          |  FIXC1  | 11728500000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_REF           |  FIXC2  | 14660630000 |
 | 
			
		||||
|              PWR_PKG_ENERGY              |   PWR0  |    338.9000 |
 | 
			
		||||
|              PWR_DRAM_ENERGY             |   PWR3  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE |   PMC0  |           0 |
 | 
			
		||||
|    FP_ARITH_INST_RETIRED_SCALAR_DOUBLE   |   PMC1  |  6240402000 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE |   PMC2  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_512B_PACKED_DOUBLE |   PMC3  |      983040 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX0C0 |     2086787 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX0C1 |     1115626 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX1C0 |     2089964 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX1C1 |     1117021 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX2C0 |     2103832 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX2C1 |     1117965 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX3C0 |     2086930 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX3C1 |     1102471 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX4C0 |     2094688 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX4C1 |     1103018 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX5C0 |     2097438 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX5C1 |     1102525 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|               Metric              | HWThread 0 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|        Runtime (RDTSC) [s]        |     5.8974 |
 | 
			
		||||
|        Runtime unhalted [s]       |     4.7026 |
 | 
			
		||||
|            Clock [MHz]            |  1995.2473 |
 | 
			
		||||
|                CPI                |     0.6440 |
 | 
			
		||||
|             Energy [J]            |   338.9000 |
 | 
			
		||||
|             Power [W]             |    57.4661 |
 | 
			
		||||
|          Energy DRAM [J]          |          0 |
 | 
			
		||||
|           Power DRAM [W]          |          0 |
 | 
			
		||||
|            DP [MFLOP/s]           |  1059.4978 |
 | 
			
		||||
|          AVX DP [MFLOP/s]         |     1.3335 |
 | 
			
		||||
|          Packed [MUOPS/s]         |     0.1667 |
 | 
			
		||||
|          Scalar [MUOPS/s]         |  1058.1643 |
 | 
			
		||||
|  Memory read bandwidth [MBytes/s] |   136.3006 |
 | 
			
		||||
|  Memory read data volume [GBytes] |     0.8038 |
 | 
			
		||||
| Memory write bandwidth [MBytes/s] |    72.2612 |
 | 
			
		||||
| Memory write data volume [GBytes] |     0.4262 |
 | 
			
		||||
|    Memory bandwidth [MBytes/s]    |   208.5618 |
 | 
			
		||||
|    Memory data volume [GBytes]    |     1.2300 |
 | 
			
		||||
|       Operational intensity       |     5.0800 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
 | 
			
		||||
@@ -1,88 +0,0 @@
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
CPU name:	Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
 | 
			
		||||
CPU type:	Intel Cascadelake SP processor
 | 
			
		||||
CPU clock:	2.49 GHz
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Initializing parameters...
 | 
			
		||||
Initializing atoms...
 | 
			
		||||
Creating atoms...
 | 
			
		||||
Pattern: seq
 | 
			
		||||
Number of timesteps: 200
 | 
			
		||||
Number of atoms: 256
 | 
			
		||||
Number of neighbors per atom: 1024
 | 
			
		||||
Number of times to replicate neighbor lists: 1
 | 
			
		||||
Estimated total data volume (kB): 1056.7680
 | 
			
		||||
Estimated atom data volume (kB): 3.0720
 | 
			
		||||
Estimated neighborlist data volume (kB): 1050.6240
 | 
			
		||||
Initializing neighbor lists...
 | 
			
		||||
Creating neighbor lists...
 | 
			
		||||
Computing forces...
 | 
			
		||||
Total time: 0.2466, Mega atom updates/s: 0.2076
 | 
			
		||||
Cycles per atom: 9631.9934, Cycles per neighbor: 9.4062
 | 
			
		||||
Statistics:
 | 
			
		||||
	Vector width: 16, Processor frequency: 2.0000 GHz
 | 
			
		||||
	Average neighbors per atom: 1018.9055
 | 
			
		||||
	Average SIMD iterations per atom: 63.6816
 | 
			
		||||
	Total number of computed pair interactions: 52428800
 | 
			
		||||
	Total number of SIMD iterations: 3276800
 | 
			
		||||
	Useful read data volume for force computation: 0.84GB
 | 
			
		||||
	Cycles/SIMD iteration: 150.4999
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Region force, Group 1: MEM_SP
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
|    Region Info    | HWThread 0 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
| RDTSC Runtime [s] |   0.085843 |
 | 
			
		||||
|     call count    |        200 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
 | 
			
		||||
+------------------------------------------+---------+------------+
 | 
			
		||||
|                   Event                  | Counter | HWThread 0 |
 | 
			
		||||
+------------------------------------------+---------+------------+
 | 
			
		||||
|             INSTR_RETIRED_ANY            |  FIXC0  |  129769100 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_CORE          |  FIXC1  |  172300100 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_REF           |  FIXC2  |  215371300 |
 | 
			
		||||
|              PWR_PKG_ENERGY              |   PWR0  |     9.2849 |
 | 
			
		||||
|              PWR_DRAM_ENERGY             |   PWR3  |          0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE |   PMC0  |          0 |
 | 
			
		||||
|    FP_ARITH_INST_RETIRED_SCALAR_SINGLE   |   PMC1  |     154000 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE |   PMC2  |          0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE |   PMC3  |   89088000 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX0C0 |       8354 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX0C1 |       1126 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX1C0 |       7863 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX1C1 |       1105 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX2C0 |       7990 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX2C1 |       1113 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX3C0 |       4775 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX3C1 |       1112 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX4C0 |       4201 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX4C1 |       1127 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX5C0 |       4035 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX5C1 |       1120 |
 | 
			
		||||
+------------------------------------------+---------+------------+
 | 
			
		||||
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|               Metric              | HWThread 0 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|        Runtime (RDTSC) [s]        |     0.0858 |
 | 
			
		||||
|        Runtime unhalted [s]       |     0.0691 |
 | 
			
		||||
|            Clock [MHz]            |  1995.2787 |
 | 
			
		||||
|                CPI                |     1.3277 |
 | 
			
		||||
|             Energy [J]            |     9.2849 |
 | 
			
		||||
|             Power [W]             |   108.1610 |
 | 
			
		||||
|          Energy DRAM [J]          |          0 |
 | 
			
		||||
|           Power DRAM [W]          |          0 |
 | 
			
		||||
|            SP [MFLOP/s]           | 16606.5397 |
 | 
			
		||||
|          AVX SP [MFLOP/s]         | 16604.7458 |
 | 
			
		||||
|          Packed [MUOPS/s]         |  1037.7966 |
 | 
			
		||||
|          Scalar [MUOPS/s]         |     1.7940 |
 | 
			
		||||
|  Memory read bandwidth [MBytes/s] |    27.7476 |
 | 
			
		||||
|  Memory read data volume [GBytes] |     0.0024 |
 | 
			
		||||
| Memory write bandwidth [MBytes/s] |     4.9974 |
 | 
			
		||||
| Memory write data volume [GBytes] |     0.0004 |
 | 
			
		||||
|    Memory bandwidth [MBytes/s]    |    32.7450 |
 | 
			
		||||
|    Memory data volume [GBytes]    |     0.0028 |
 | 
			
		||||
|       Operational intensity       |   507.1471 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
 | 
			
		||||
@@ -1,168 +0,0 @@
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
CPU name:	Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz
 | 
			
		||||
CPU type:	Intel Cascadelake SP processor
 | 
			
		||||
CPU clock:	2.49 GHz
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Parameters:
 | 
			
		||||
	Force field: lj
 | 
			
		||||
	Kernel: plain-C
 | 
			
		||||
	Data layout: AoS
 | 
			
		||||
	Floating-point precision: single
 | 
			
		||||
	Unit cells (nx, ny, nz): 32, 32, 32
 | 
			
		||||
	Domain box sizes (x, y, z): 5.374708e+01, 5.374708e+01, 5.374708e+01
 | 
			
		||||
	Periodic (x, y, z): 1, 1, 1
 | 
			
		||||
	Lattice size: 1.679596e+00
 | 
			
		||||
	Epsilon: 1.000000e+00
 | 
			
		||||
	Sigma: 1.000000e+00
 | 
			
		||||
	Spring constant: 1.000000e+00
 | 
			
		||||
	Damping constant: 1.000000e+00
 | 
			
		||||
	Temperature: 1.440000e+00
 | 
			
		||||
	RHO: 8.442000e-01
 | 
			
		||||
	Mass: 1.000000e+00
 | 
			
		||||
	Number of types: 4
 | 
			
		||||
	Number of timesteps: 200
 | 
			
		||||
	Report stats every (timesteps): 100
 | 
			
		||||
	Reneighbor every (timesteps): 20
 | 
			
		||||
	Prune every (timesteps): 1000
 | 
			
		||||
	Output positions every (timesteps): 20
 | 
			
		||||
	Output velocities every (timesteps): 5
 | 
			
		||||
	Delta time (dt): 5.000000e-03
 | 
			
		||||
	Cutoff radius: 2.500000e+00
 | 
			
		||||
	Skin: 3.000000e-01
 | 
			
		||||
	Half neighbor lists: 0
 | 
			
		||||
	Processor frequency (GHz): 2.0000
 | 
			
		||||
----------------------------------------------------------------------------
 | 
			
		||||
step	temp		pressure
 | 
			
		||||
0	1.440000e+00	1.215639e+00
 | 
			
		||||
100	8.200897e-01	6.923144e-01
 | 
			
		||||
200	7.961481e-01	6.721031e-01
 | 
			
		||||
----------------------------------------------------------------------------
 | 
			
		||||
System: 131072 atoms 47265 ghost atoms, Steps: 200
 | 
			
		||||
TOTAL 10.83s FORCE 4.62s NEIGH 5.94s REST 0.26s
 | 
			
		||||
----------------------------------------------------------------------------
 | 
			
		||||
Performance: 2.42 million atom updates per second
 | 
			
		||||
Statistics:
 | 
			
		||||
	Vector width: 16, Processor frequency: 2.0000 GHz
 | 
			
		||||
	Average neighbors per atom: 76.0351
 | 
			
		||||
	Average SIMD iterations per atom: 5.0875
 | 
			
		||||
	Total number of computed pair interactions: 2003181259
 | 
			
		||||
	Total number of SIMD iterations: 134032075
 | 
			
		||||
	Useful read data volume for force computation: 32.79GB
 | 
			
		||||
	Cycles/SIMD iteration: 68.9511
 | 
			
		||||
--------------------------------------------------------------------------------
 | 
			
		||||
Region force, Group 1: MEM_SP
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
|    Region Info    | HWThread 0 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
| RDTSC Runtime [s] |   4.452877 |
 | 
			
		||||
|     call count    |        201 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|                   Event                  | Counter |  HWThread 0 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|             INSTR_RETIRED_ANY            |  FIXC0  |  7428719000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_CORE          |  FIXC1  |  8875251000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_REF           |  FIXC2  | 11094050000 |
 | 
			
		||||
|              PWR_PKG_ENERGY              |   PWR0  |    265.5057 |
 | 
			
		||||
|              PWR_DRAM_ENERGY             |   PWR3  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE |   PMC0  |           0 |
 | 
			
		||||
|    FP_ARITH_INST_RETIRED_SCALAR_SINGLE   |   PMC1  |    79036820 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE |   PMC2  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE |   PMC3  |  3935012000 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX0C0 |    19716700 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX0C1 |      595747 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX1C0 |    19734880 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX1C1 |      597090 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX2C0 |    19732800 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX2C1 |      595219 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX3C0 |    19886430 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX3C1 |      632443 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX4C0 |    19887210 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX4C1 |      633169 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX5C0 |    19935560 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX5C1 |      634112 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|               Metric              | HWThread 0 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|        Runtime (RDTSC) [s]        |     4.4529 |
 | 
			
		||||
|        Runtime unhalted [s]       |     3.5585 |
 | 
			
		||||
|            Clock [MHz]            |  1995.2693 |
 | 
			
		||||
|                CPI                |     1.1947 |
 | 
			
		||||
|             Energy [J]            |   265.5057 |
 | 
			
		||||
|             Power [W]             |    59.6257 |
 | 
			
		||||
|          Energy DRAM [J]          |          0 |
 | 
			
		||||
|           Power DRAM [W]          |          0 |
 | 
			
		||||
|            SP [MFLOP/s]           | 14156.9661 |
 | 
			
		||||
|          AVX SP [MFLOP/s]         | 14139.2165 |
 | 
			
		||||
|          Packed [MUOPS/s]         |   883.7010 |
 | 
			
		||||
|          Scalar [MUOPS/s]         |    17.7496 |
 | 
			
		||||
|  Memory read bandwidth [MBytes/s] |  1708.8254 |
 | 
			
		||||
|  Memory read data volume [GBytes] |     7.6092 |
 | 
			
		||||
| Memory write bandwidth [MBytes/s] |    53.0035 |
 | 
			
		||||
| Memory write data volume [GBytes] |     0.2360 |
 | 
			
		||||
|    Memory bandwidth [MBytes/s]    |  1761.8288 |
 | 
			
		||||
|    Memory data volume [GBytes]    |     7.8452 |
 | 
			
		||||
|       Operational intensity       |     8.0354 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
 | 
			
		||||
Region reneighbour, Group 1: MEM_SP
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
|    Region Info    | HWThread 0 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
| RDTSC Runtime [s] |   5.935627 |
 | 
			
		||||
|     call count    |         10 |
 | 
			
		||||
+-------------------+------------+
 | 
			
		||||
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|                   Event                  | Counter |  HWThread 0 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
|             INSTR_RETIRED_ANY            |  FIXC0  | 18208530000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_CORE          |  FIXC1  | 11805500000 |
 | 
			
		||||
|           CPU_CLK_UNHALTED_REF           |  FIXC2  | 14756870000 |
 | 
			
		||||
|              PWR_PKG_ENERGY              |   PWR0  |    340.7903 |
 | 
			
		||||
|              PWR_DRAM_ENERGY             |   PWR3  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE |   PMC0  |           0 |
 | 
			
		||||
|    FP_ARITH_INST_RETIRED_SCALAR_SINGLE   |   PMC1  |  6240406000 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE |   PMC2  |           0 |
 | 
			
		||||
| FP_ARITH_INST_RETIRED_512B_PACKED_SINGLE |   PMC3  |      491520 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX0C0 |     1772377 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX0C1 |      975760 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX1C0 |     1770611 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX1C1 |      977433 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX2C0 |     1771722 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX2C1 |      979122 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX3C0 |     1782901 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX3C1 |      967621 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX4C0 |     1780789 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX4C1 |      967179 |
 | 
			
		||||
|               CAS_COUNT_RD               | MBOX5C0 |     1784733 |
 | 
			
		||||
|               CAS_COUNT_WR               | MBOX5C1 |      969349 |
 | 
			
		||||
+------------------------------------------+---------+-------------+
 | 
			
		||||
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|               Metric              | HWThread 0 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
|        Runtime (RDTSC) [s]        |     5.9356 |
 | 
			
		||||
|        Runtime unhalted [s]       |     4.7334 |
 | 
			
		||||
|            Clock [MHz]            |  1995.2675 |
 | 
			
		||||
|                CPI                |     0.6483 |
 | 
			
		||||
|             Energy [J]            |   340.7903 |
 | 
			
		||||
|             Power [W]             |    57.4144 |
 | 
			
		||||
|          Energy DRAM [J]          |          0 |
 | 
			
		||||
|           Power DRAM [W]          |          0 |
 | 
			
		||||
|            SP [MFLOP/s]           |  1052.6723 |
 | 
			
		||||
|          AVX SP [MFLOP/s]         |     1.3249 |
 | 
			
		||||
|          Packed [MUOPS/s]         |     0.0828 |
 | 
			
		||||
|          Scalar [MUOPS/s]         |  1051.3474 |
 | 
			
		||||
|  Memory read bandwidth [MBytes/s] |   114.9736 |
 | 
			
		||||
|  Memory read data volume [GBytes] |     0.6824 |
 | 
			
		||||
| Memory write bandwidth [MBytes/s] |    62.9308 |
 | 
			
		||||
| Memory write data volume [GBytes] |     0.3735 |
 | 
			
		||||
|    Memory bandwidth [MBytes/s]    |   177.9044 |
 | 
			
		||||
|    Memory data volume [GBytes]    |     1.0560 |
 | 
			
		||||
|       Operational intensity       |     5.9171 |
 | 
			
		||||
+-----------------------------------+------------+
 | 
			
		||||
 | 
			
		||||
@@ -1,148 +0,0 @@
 | 
			
		||||
Intel(R) Architecture Code Analyzer Version -  v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
 | 
			
		||||
Analyzed File -  gromacs-avx512-dp-ICX.o
 | 
			
		||||
Binary Format - 64Bit
 | 
			
		||||
Architecture  -  SKX
 | 
			
		||||
Analysis Type - Throughput
 | 
			
		||||
 | 
			
		||||
Throughput Analysis Report
 | 
			
		||||
--------------------------
 | 
			
		||||
Block Throughput: 47.68 Cycles       Throughput Bottleneck: Backend
 | 
			
		||||
Loop Count:  22
 | 
			
		||||
Port Binding In Cycles Per Iteration:
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
|  Port  |   0   -  DV   |   1   |   2   -  D    |   3   -  D    |   4   |   5   |   6   |   7   |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
| Cycles | 42.0     0.0  | 12.5  |  5.0     5.0  |  5.0     5.0  |  0.0  | 42.0  | 12.5  |  0.0  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
DV - Divider pipe (on port 0)
 | 
			
		||||
D - Data fetch pipe (on ports 2 and 3)
 | 
			
		||||
F - Macro Fusion with the previous instruction occurred
 | 
			
		||||
* - instruction micro-ops not bound to a port
 | 
			
		||||
^ - Micro Fusion occurred
 | 
			
		||||
# - ESP Tracking sync uop was issued
 | 
			
		||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
 | 
			
		||||
X - instruction not supported, was not accounted in Analysis
 | 
			
		||||
 | 
			
		||||
| Num Of   |                    Ports pressure in cycles                         |      |
 | 
			
		||||
|  Uops    |  0  - DV    |  1   |  2  -  D    |  3  -  D    |  4   |  5   |  6   |  7   |
 | 
			
		||||
-----------------------------------------------------------------------------------------
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | movsxd rbx, dword ptr [r12+r14*4]
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea rcx, ptr [rbx+rbx*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl rcx, 0x6
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovapd zmm29, zmmword ptr [rsi+rcx*1]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovapd zmm30, zmmword ptr [rsi+rcx*1+0x40]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovapd zmm31, zmmword ptr [rsi+rcx*1+0x80]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovupd zmm3, zmmword ptr [rsp+0x40]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm4, zmm3, zmm29
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovupd zmm3, zmmword ptr [rsp+0x140]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm3, zmm3, zmm30
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea ecx, ptr [rbx+rbx*1]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | cmp rdi, rcx
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setnz dl
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz cl
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea ebx, ptr [rbx+rbx*1+0x1]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm17, zmm25, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm18, zmm17, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm18, zmm3, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm18, zmm4, zmm4
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm19, zmm18
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | cmp rdi, rbx
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz bl
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov ebp, ebx
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm20, zmm19, zmm22
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm21, zmm19, zmm19
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm20, zmm21, zmm20
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovupd zmm21, zmmword ptr [rsp+0x80]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm21, zmm21, zmm29
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl bpl, 0x4
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm19, zmm1, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm19, zmm20
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddpd zmm20, zmm20, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm19, zmm20
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovupd zmm20, zmmword ptr [rsp+0x100]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm20, zmm20, zmm30
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | not bpl
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub bpl, cl
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, ebp
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm18, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm18, zmm26, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm15{k1}, zmm19, zmm4
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm4, zmm18, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm4, zmm20, zmm20
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm4, zmm21, zmm21
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm12{k1}, zmm19, zmm3
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm3, zmm4
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea ecx, ptr [rdx+rdx*1]
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov eax, ebx
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm8{k1}, zmm19, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm17, zmm3, zmm22
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm19, zmm3, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm17, zmm19, zmm17
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovupd zmm19, zmmword ptr [rsp+0x1c0]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm19, zmm19, zmm29
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl al, 0x5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm3, zmm1, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm3, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddpd zmm17, zmm17, zmm2
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm3, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm17, zmm23, zmm30
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | sub cl, al
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | add cl, 0xfd
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, ecx
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm4, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm4, zmm27, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm14{k1}, zmm3, zmm21
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm21, zmm4, zmm4
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm21, zmm17, zmm17
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm21, zmm19, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm10{k1}, zmm3, zmm20
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm20, zmm21
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm6{k1}, zmm3, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm20, zmm22
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm18, zmm20, zmm20
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm18, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm18, zmm1, zmm20
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm18, zmm18, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddpd zmm3, zmm3, zmm2
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm18, zmm3
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea eax, ptr [rdx*4]
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov ecx, ebx
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl cl, 0x6
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | sub al, cl
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | add al, 0xfb
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, eax
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm21, zmm0, 0x1
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovupd zmm18, zmmword ptr [rsp+0x180]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm18, zmm18, zmm29
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm20, zmm24, zmm30
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm21, zmm28, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm16{k1}, zmm3, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm21, zmm21
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm19, zmm20, zmm20
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm19, zmm18, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm11{k1}, zmm3, zmm17
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm17, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm7{k1}, zmm3, zmm4
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm3, zmm17, zmm22
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm4, zmm17, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm3, zmm4, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm4, zmm1, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm4, zmm4, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddpd zmm3, zmm3, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm3, zmm4, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl dl, 0x3
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl bl, 0x7
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub dl, bl
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add dl, 0xf7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, edx
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm19, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm13{k1}, zmm3, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm9{k1}, zmm3, zmm20
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm5{k1}, zmm3, zmm21
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | inc r14
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | cmp r11, r14
 | 
			
		||||
|   0*F    |             |      |             |             |      |      |      |      | jnz 0xfffffffffffffd99
 | 
			
		||||
Total Num Of Uops: 123
 | 
			
		||||
Analysis Notes:
 | 
			
		||||
Backend allocation was stalled due to unavailable allocation resources.
 | 
			
		||||
@@ -1,159 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      gromacs-avx512-dp-ICX.s
 | 
			
		||||
Architecture:       CSX
 | 
			
		||||
Timestamp:          2023-01-03 00:07:20
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                      Port pressure in cycles                                       
 | 
			
		||||
     |  0   - 0DV  |  1   |  2   -  2D  |  3   -  3D  |  4   |  5   |  6   |  7   ||  CP  | LCD  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
2287 |             |      |             |             |      |      |      |      ||      |      |   .LBB5_11:                               #
 | 
			
		||||
2288 |             |      |             |             |      |      |      |      ||      |      |   #   Parent Loop BB5_6 Depth=1
 | 
			
		||||
2289 |             |      |             |             |      |      |      |      ||      |      |   # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
2290 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||  4.0 |      |   movslq (%r12,%r14,4), %rbx
 | 
			
		||||
2291 |             | 1.00 |             |             |      | 0.00 |      |      ||  1.0 |      |   leaq (%rbx,%rbx,2), %rcx
 | 
			
		||||
2292 | 0.00        |      |             |             |      |      | 1.00 |      ||  1.0 |      |   shlq $6, %rcx
 | 
			
		||||
2293 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovapd (%rsi,%rcx), %zmm29
 | 
			
		||||
2294 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovapd 64(%rsi,%rcx), %zmm30
 | 
			
		||||
2295 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||  0.0 |      |   vmovapd 128(%rsi,%rcx), %zmm31
 | 
			
		||||
2296 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 64(%rsp), %zmm3         # 64-byte Reload
 | 
			
		||||
2297 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm29, %zmm3, %zmm4
 | 
			
		||||
2298 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 320(%rsp), %zmm3        # 64-byte Reload
 | 
			
		||||
2299 | 0.75        |      |             |             |      | 0.25 |      |      ||      |      |   vsubpd %zmm30, %zmm3, %zmm3
 | 
			
		||||
2300 |             | 1.00 |             |             |      | 0.00 |      |      ||      |      |   leal (%rbx,%rbx), %ecx
 | 
			
		||||
2301 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   cmpq %rcx, %rdi
 | 
			
		||||
2302 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   setne %dl
 | 
			
		||||
2303 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   sete %cl
 | 
			
		||||
2304 |             | 1.00 |             |             |      |      |      |      ||      |      |   leal 1(%rbx,%rbx), %ebx
 | 
			
		||||
2305 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vsubpd %zmm31, %zmm25, %zmm17
 | 
			
		||||
2306 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm17, %zmm17, %zmm18
 | 
			
		||||
2307 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm3, %zmm3, %zmm18 # zmm18 = (zmm3 * zmm3) + zmm18
 | 
			
		||||
2308 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm4, %zmm4, %zmm18 # zmm18 = (zmm4 * zmm4) + zmm18
 | 
			
		||||
2309 | 2.75        |      |             |             |      | 0.25 |      |      ||  8.0 |      |   vrcp14pd %zmm18, %zmm19
 | 
			
		||||
2310 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   cmpq %rbx, %rdi
 | 
			
		||||
2311 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   sete %bl
 | 
			
		||||
2312 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   movl %ebx, %ebp
 | 
			
		||||
2313 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm22, %zmm19, %zmm20
 | 
			
		||||
2314 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm19, %zmm19, %zmm21
 | 
			
		||||
2315 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm20, %zmm21, %zmm20
 | 
			
		||||
2316 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 128(%rsp), %zmm21       # 64-byte Reload
 | 
			
		||||
2317 | 0.75        |      |             |             |      | 0.25 |      |      ||      |      |   vsubpd %zmm29, %zmm21, %zmm21
 | 
			
		||||
2318 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $4, %bpl
 | 
			
		||||
2319 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm19, %zmm1, %zmm19
 | 
			
		||||
2320 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm20, %zmm19, %zmm19
 | 
			
		||||
2321 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddpd %zmm2, %zmm20, %zmm20
 | 
			
		||||
2322 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm20, %zmm19, %zmm19
 | 
			
		||||
2323 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 256(%rsp), %zmm20       # 64-byte Reload
 | 
			
		||||
2324 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm30, %zmm20, %zmm20
 | 
			
		||||
2325 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   notb %bpl
 | 
			
		||||
2326 | 0.00        | 0.25 |             |             |      | 0.00 | 0.75 |      ||      |      |   subb %cl, %bpl
 | 
			
		||||
2327 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %ebp, %k1
 | 
			
		||||
2328 |             |      |             |             |      |      |      |      ||      |      | X vcmpltpd %zmm0, %zmm18, %k1 {%k1}
 | 
			
		||||
2329 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm31, %zmm26, %zmm18
 | 
			
		||||
2330 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm4, %zmm19, %zmm15 {%k1} # zmm15 = (zmm19 * zmm4) + zmm15
 | 
			
		||||
2331 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm18, %zmm18, %zmm4
 | 
			
		||||
2332 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm20, %zmm20, %zmm4 # zmm4 = (zmm20 * zmm20) + zmm4
 | 
			
		||||
2333 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm21, %zmm21, %zmm4 # zmm4 = (zmm21 * zmm21) + zmm4
 | 
			
		||||
2334 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm3, %zmm19, %zmm12 {%k1} # zmm12 = (zmm19 * zmm3) + zmm12
 | 
			
		||||
2335 | 2.25        |      |             |             |      | 0.75 |      |      ||      |      |   vrcp14pd %zmm4, %zmm3
 | 
			
		||||
2336 |             | 1.00 |             |             |      | 0.00 |      |      ||      |      |   leal (%rdx,%rdx), %ecx
 | 
			
		||||
2337 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   movl %ebx, %eax
 | 
			
		||||
2338 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm17, %zmm19, %zmm8 {%k1} # zmm8 = (zmm19 * zmm17) + zmm8
 | 
			
		||||
2339 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm22, %zmm3, %zmm17
 | 
			
		||||
2340 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm3, %zmm3, %zmm19
 | 
			
		||||
2341 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm17, %zmm19, %zmm17
 | 
			
		||||
2342 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 448(%rsp), %zmm19       # 64-byte Reload
 | 
			
		||||
2343 | 0.75        |      |             |             |      | 0.25 |      |      ||      |      |   vsubpd %zmm29, %zmm19, %zmm19
 | 
			
		||||
2344 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $5, %al
 | 
			
		||||
2345 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm3, %zmm1, %zmm3
 | 
			
		||||
2346 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm17, %zmm3, %zmm3
 | 
			
		||||
2347 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddpd %zmm2, %zmm17, %zmm17
 | 
			
		||||
2348 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm17, %zmm3, %zmm3
 | 
			
		||||
2349 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm30, %zmm23, %zmm17
 | 
			
		||||
2350 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   subb %al, %cl
 | 
			
		||||
2351 | 0.00        | 0.25 |             |             |      | 0.00 | 0.75 |      ||      |      |   addb $-3, %cl
 | 
			
		||||
2352 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %ecx, %k1
 | 
			
		||||
2353 |             |      |             |             |      |      |      |      ||      |      | X vcmpltpd %zmm0, %zmm4, %k1 {%k1}
 | 
			
		||||
2354 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm31, %zmm27, %zmm4
 | 
			
		||||
2355 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm21, %zmm3, %zmm14 {%k1} # zmm14 = (zmm3 * zmm21) + zmm14
 | 
			
		||||
2356 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm4, %zmm4, %zmm21
 | 
			
		||||
2357 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm17, %zmm17, %zmm21 # zmm21 = (zmm17 * zmm17) + zmm21
 | 
			
		||||
2358 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm19, %zmm19, %zmm21 # zmm21 = (zmm19 * zmm19) + zmm21
 | 
			
		||||
2359 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm20, %zmm3, %zmm10 {%k1} # zmm10 = (zmm3 * zmm20) + zmm10
 | 
			
		||||
2360 | 2.50        |      |             |             |      | 0.50 |      |      ||      |      |   vrcp14pd %zmm21, %zmm20
 | 
			
		||||
2361 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm18, %zmm3, %zmm6 {%k1} # zmm6 = (zmm3 * zmm18) + zmm6
 | 
			
		||||
2362 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm22, %zmm20, %zmm3
 | 
			
		||||
2363 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm20, %zmm20, %zmm18
 | 
			
		||||
2364 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm3, %zmm18, %zmm3
 | 
			
		||||
2365 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm20, %zmm1, %zmm18
 | 
			
		||||
2366 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm3, %zmm18, %zmm18
 | 
			
		||||
2367 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddpd %zmm2, %zmm3, %zmm3
 | 
			
		||||
2368 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm3, %zmm18, %zmm3
 | 
			
		||||
2369 |             | 1.00 |             |             |      | 0.00 |      |      ||      |      |   leal (,%rdx,4), %eax
 | 
			
		||||
2370 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   movl %ebx, %ecx
 | 
			
		||||
2371 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $6, %cl
 | 
			
		||||
2372 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   subb %cl, %al
 | 
			
		||||
2373 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   addb $-5, %al
 | 
			
		||||
2374 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %eax, %k1
 | 
			
		||||
2375 |             |      |             |             |      |      |      |      ||      |      | X vcmpltpd %zmm0, %zmm21, %k1 {%k1}
 | 
			
		||||
2376 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 384(%rsp), %zmm18       # 64-byte Reload
 | 
			
		||||
2377 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm29, %zmm18, %zmm18
 | 
			
		||||
2378 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm30, %zmm24, %zmm20
 | 
			
		||||
2379 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm31, %zmm28, %zmm21
 | 
			
		||||
2380 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm19, %zmm3, %zmm16 {%k1} # zmm16 = (zmm3 * zmm19) + zmm16
 | 
			
		||||
2381 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm21, %zmm21, %zmm19
 | 
			
		||||
2382 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm20, %zmm20, %zmm19 # zmm19 = (zmm20 * zmm20) + zmm19
 | 
			
		||||
2383 | 0.25        |      |             |             |      | 0.75 |      |      ||      |      |   vfmadd231pd %zmm18, %zmm18, %zmm19 # zmm19 = (zmm18 * zmm18) + zmm19
 | 
			
		||||
2384 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm17, %zmm3, %zmm11 {%k1} # zmm11 = (zmm3 * zmm17) + zmm11
 | 
			
		||||
2385 | 2.00        |      |             |             |      | 1.00 |      |      ||      |      |   vrcp14pd %zmm19, %zmm17
 | 
			
		||||
2386 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm4, %zmm3, %zmm7 {%k1} # zmm7 = (zmm3 * zmm4) + zmm7
 | 
			
		||||
2387 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm22, %zmm17, %zmm3
 | 
			
		||||
2388 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm17, %zmm17, %zmm4
 | 
			
		||||
2389 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm3, %zmm4, %zmm3
 | 
			
		||||
2390 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm17, %zmm1, %zmm4
 | 
			
		||||
2391 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm3, %zmm4, %zmm4
 | 
			
		||||
2392 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vaddpd %zmm2, %zmm3, %zmm3
 | 
			
		||||
2393 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm3, %zmm4, %zmm3
 | 
			
		||||
2394 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $3, %dl
 | 
			
		||||
2395 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $7, %bl
 | 
			
		||||
2396 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   subb %bl, %dl
 | 
			
		||||
2397 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   addb $-9, %dl
 | 
			
		||||
2398 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %edx, %k1
 | 
			
		||||
2399 |             |      |             |             |      |      |      |      ||      |      | X vcmpltpd %zmm0, %zmm19, %k1 {%k1}
 | 
			
		||||
2400 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm18, %zmm3, %zmm13 {%k1} # zmm13 = (zmm3 * zmm18) + zmm13
 | 
			
		||||
2401 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm20, %zmm3, %zmm9 {%k1} # zmm9 = (zmm3 * zmm20) + zmm9
 | 
			
		||||
2402 | 0.00        |      |             |             |      | 1.00 |      |      ||      |  4.0 |   vfmadd231pd %zmm21, %zmm3, %zmm5 {%k1} # zmm5 = (zmm3 * zmm21) + zmm5
 | 
			
		||||
2403 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   incq %r14
 | 
			
		||||
2404 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   cmpq %r14, %r11
 | 
			
		||||
2405 |             |      |             |             |      |      |      |      ||      |      | * jne .LBB5_11
 | 
			
		||||
 | 
			
		||||
       40.0          14.5   5.00   5.00   5.00   5.00          40.0   14.5           50.0    4.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
2402 |  4.0 | vfmadd231pd	%zmm21, %zmm3, %zmm5 {%k1} # zmm5 = (zmm3 * zmm21) + zmm5| [2402]
 | 
			
		||||
2401 |  4.0 | vfmadd231pd	%zmm20, %zmm3, %zmm9 {%k1} # zmm9 = (zmm3 * zmm20) + zmm9| [2401]
 | 
			
		||||
2400 |  4.0 | vfmadd231pd	%zmm18, %zmm3, %zmm13 {%k1} # zmm13 = (zmm3 * zmm18) + zmm13| [2400]
 | 
			
		||||
2386 |  4.0 | vfmadd231pd	%zmm4, %zmm3, %zmm7 {%k1} # zmm7 = (zmm3 * zmm4) + zmm7| [2386]
 | 
			
		||||
2384 |  4.0 | vfmadd231pd	%zmm17, %zmm3, %zmm11 {%k1} # zmm11 = (zmm3 * zmm17) + zmm11| [2384]
 | 
			
		||||
2380 |  4.0 | vfmadd231pd	%zmm19, %zmm3, %zmm16 {%k1} # zmm16 = (zmm3 * zmm19) + zmm16| [2380]
 | 
			
		||||
2361 |  4.0 | vfmadd231pd	%zmm18, %zmm3, %zmm6 {%k1} # zmm6 = (zmm3 * zmm18) + zmm6| [2361]
 | 
			
		||||
2359 |  4.0 | vfmadd231pd	%zmm20, %zmm3, %zmm10 {%k1} # zmm10 = (zmm3 * zmm20) + zmm10| [2359]
 | 
			
		||||
2355 |  4.0 | vfmadd231pd	%zmm21, %zmm3, %zmm14 {%k1} # zmm14 = (zmm3 * zmm21) + zmm14| [2355]
 | 
			
		||||
2338 |  4.0 | vfmadd231pd	%zmm17, %zmm19, %zmm8 {%k1} # zmm8 = (zmm19 * zmm17) + zmm8| [2338]
 | 
			
		||||
2334 |  4.0 | vfmadd231pd	%zmm3, %zmm19, %zmm12 {%k1} # zmm12 = (zmm19 * zmm3) + zmm12| [2334]
 | 
			
		||||
2330 |  4.0 | vfmadd231pd	%zmm4, %zmm19, %zmm15 {%k1} # zmm15 = (zmm19 * zmm4) + zmm15| [2330]
 | 
			
		||||
2394 |  3.0 | shlb	$3, %dl                        | [2394, 2396, 2397]
 | 
			
		||||
2318 |  3.0 | shlb	$4, %bpl                       | [2318, 2325, 2326]
 | 
			
		||||
2403 |  1.0 | incq	%r14                           | [2403]
 | 
			
		||||
 | 
			
		||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@@ -1,198 +0,0 @@
 | 
			
		||||
Intel(R) Architecture Code Analyzer Version -  v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
 | 
			
		||||
Analyzed File -  gromacs-icc-avx512-dp.o
 | 
			
		||||
Binary Format - 64Bit
 | 
			
		||||
Architecture  -  SKX
 | 
			
		||||
Analysis Type - Throughput
 | 
			
		||||
 | 
			
		||||
Throughput Analysis Report
 | 
			
		||||
--------------------------
 | 
			
		||||
Block Throughput: 62.00 Cycles       Throughput Bottleneck: Backend
 | 
			
		||||
Loop Count:  22
 | 
			
		||||
Port Binding In Cycles Per Iteration:
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
|  Port  |   0   -  DV   |   1   |   2   -  D    |   3   -  D    |   4   |   5   |   6   |   7   |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
| Cycles | 58.0     0.0  | 16.0  | 16.0    15.0  | 16.0    15.0  |  2.0  | 58.0  | 16.0  |  0.0  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
DV - Divider pipe (on port 0)
 | 
			
		||||
D - Data fetch pipe (on ports 2 and 3)
 | 
			
		||||
F - Macro Fusion with the previous instruction occurred
 | 
			
		||||
* - instruction micro-ops not bound to a port
 | 
			
		||||
^ - Micro Fusion occurred
 | 
			
		||||
# - ESP Tracking sync uop was issued
 | 
			
		||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
 | 
			
		||||
X - instruction not supported, was not accounted in Analysis
 | 
			
		||||
 | 
			
		||||
| Num Of   |                    Ports pressure in cycles                         |      |
 | 
			
		||||
|  Uops    |  0  - DV    |  1   |  2  -  D    |  3  -  D    |  4   |  5   |  6   |  7   |
 | 
			
		||||
-----------------------------------------------------------------------------------------
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | mov edx, dword ptr [r10+rsi*4]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | inc rsi
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm20, zmmword ptr [rsp+0x380]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm25, zmmword ptr [rsp+0x340]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm24, zmmword ptr [rsp+0x1c0]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm23, zmmword ptr [rsp+0x2c0]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm16, zmmword ptr [rsp+0x3c0]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm14, zmmword ptr [rsp+0x300]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm15, zmmword ptr [rsp+0x240]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm12, zmmword ptr [rsp+0x180]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm21, zmmword ptr [rsp+0x200]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm18, zmmword ptr [rsp+0x140]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm22, zmmword ptr [rsp+0x100]
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm17, zmmword ptr [rsp+0x280]
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea r12d, ptr [rdx+rdx*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl r12d, 0x3
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea r13d, ptr [rdx+rdx*1]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | movsxd r12, r12d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | cmp r13d, r11d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea eax, ptr [rdx+rdx*1+0x1]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | mov edx, 0x0
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz dl
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | cmp eax, r11d
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | mov eax, 0x0
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r13d, edx
 | 
			
		||||
|   2      | 1.0         |      |             | 1.0     1.0 |      |      |      |      | vsubpd zmm29, zmm20, zmmword ptr [r8+r12*8+0x80]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz al
 | 
			
		||||
|   2      |             |      | 1.0     1.0 |             |      | 1.0  |      |      | vsubpd zmm26, zmm25, zmmword ptr [r8+r12*8+0x80]
 | 
			
		||||
|   2      | 1.0         |      |             | 1.0     1.0 |      |      |      |      | vsubpd zmm25, zmm24, zmmword ptr [r8+r12*8]
 | 
			
		||||
|   2      |             |      | 1.0     1.0 |             |      | 1.0  |      |      | vsubpd zmm24, zmm23, zmmword ptr [r8+r12*8+0x40]
 | 
			
		||||
|   2      | 1.0         |      |             | 1.0     1.0 |      |      |      |      | vsubpd zmm23, zmm16, zmmword ptr [r8+r12*8+0x80]
 | 
			
		||||
|   2      |             |      | 1.0     1.0 |             |      | 1.0  |      |      | vsubpd zmm20, zmm14, zmmword ptr [r8+r12*8+0x80]
 | 
			
		||||
|   2      | 1.0         |      |             | 1.0     1.0 |      |      |      |      | vsubpd zmm27, zmm12, zmmword ptr [r8+r12*8+0x40]
 | 
			
		||||
|   2      |             |      | 1.0     1.0 |             |      | 1.0  |      |      | vsubpd zmm28, zmm15, zmmword ptr [r8+r12*8]
 | 
			
		||||
|   2      | 1.0         |      |             | 1.0     1.0 |      |      |      |      | vsubpd zmm30, zmm21, zmmword ptr [r8+r12*8+0x40]
 | 
			
		||||
|   2      |             |      | 1.0     1.0 |             |      | 1.0  |      |      | vsubpd zmm21, zmm18, zmmword ptr [r8+r12*8+0x40]
 | 
			
		||||
|   2      | 1.0         |      |             | 1.0     1.0 |      |      |      |      | vsubpd zmm31, zmm22, zmmword ptr [r8+r12*8]
 | 
			
		||||
|   2      |             |      | 1.0     1.0 |             |      | 1.0  |      |      | vsubpd zmm22, zmm17, zmmword ptr [r8+r12*8]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm13, zmm29, zmm29
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm15, zmm26, zmm26
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm12, zmm23, zmm23
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm14, zmm20, zmm20
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm16, zmmword ptr [rsp+0xc0]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm13, zmm30, zmm30
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm15, zmm27, zmm27
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm12, zmm24, zmm24
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm14, zmm21, zmm21
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm13, zmm31, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm15, zmm28, zmm28
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm12, zmm25, zmm25
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm14, zmm22, zmm22
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm19, zmm13
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm18, zmm15
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm17, zmm12
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1, zmm13, zmm16, 0x11
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k6, zmm15, zmm16, 0x11
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k7, zmm12, zmm16, 0x11
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k0, zmm14, zmm16, 0x11
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm15, zmm14
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm16, zmmword ptr [rsp+0x40]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm12, zmmword ptr [rsp+0x80]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm13, zmm19, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm13, zmm19, zmm13
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | neg r13d
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm14, zmm19, zmm13
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r12d, eax
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmsub213pd zmm13, zmm19, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm19, zmm19, zmm12
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm13, zmm13, zmm19
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add r13d, 0xff
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm14, zmm14, zmm13
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | nop 
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm13, zmmword ptr [rsp+0x400]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm10, zmm14
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl r12d, 0x4
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub r13d, r12d
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k5, r13d
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovw r13d, k1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb r12d, k5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k5, r12d
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k1, r13d
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r13d, eax
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandb k5, k5, k1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb r12d, k5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k5, r12d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea r12d, ptr [rdx+rdx*1]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm9{k5}, zmm19, zmm29
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | neg r12d
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm13{k5}, zmm19, zmm31
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm31, zmmword ptr [rsp+0x440]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm29, zmm18, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm31{k5}, zmm19, zmm30
 | 
			
		||||
|   2^     |             |      | 1.0         |             | 1.0  |      |      |      | vmovups zmmword ptr [rsp+0x400], zmm13
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm30, zmm18, zmm29
 | 
			
		||||
|   2^     |             |      |             | 1.0         | 1.0  |      |      |      | vmovups zmmword ptr [rsp+0x440], zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm13, zmm18, zmm30
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmsub213pd zmm30, zmm18, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm18, zmm18, zmm12
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm14, zmm30, zmm18
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add r12d, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm13, zmm14
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm29, zmm10, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl r13d, 0x5
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub r12d, r13d
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k1, r12d
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovw r12d, k6
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb r13d, k1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k1, r13d
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k6, r12d
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r12d, eax
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandb k1, k1, k6
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb r13d, k1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k1, r13d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea r13d, ptr [rdx*4]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm6{k1}, zmm29, zmm26
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | neg r13d
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm7{k1}, zmm29, zmm27
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm8{k1}, zmm29, zmm28
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm26, zmm17, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm28, zmm17, zmm12
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm15, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm12, zmm15, zmm12
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm27, zmm17, zmm26
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm15, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm13, zmm17, zmm27
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmsub213pd zmm27, zmm17, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm14, zmm27, zmm28
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | add r13d, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm17, zmm13, zmm14
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl edx, 0x3
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl r12d, 0x6
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | neg edx
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm18, zmm10, zmm17
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub r13d, r12d
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k6, r13d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add edx, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl eax, 0x7
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub edx, eax
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb eax, k6
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k6, eax
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovw eax, k7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k7, eax
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandb k7, k6, k7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k6, edx
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb edx, k7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k7, edx
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovw edx, k0
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm11{k7}, zmm18, zmm23
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm4{k7}, zmm18, zmm24
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm5{k7}, zmm18, zmm25
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm23, zmm15, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmsub213pd zmm19, zmm15, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm15, zmm19, zmm12
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm24, zmm23, zmm15
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm25, zmm10, zmm24
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb eax, k6
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k6, eax
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovb k0, edx
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandb k0, k6, k0
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovb r12d, k0
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k6, r12d
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm3{k6}, zmm25, zmm22
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm2{k6}, zmm25, zmm21
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm0{k6}, zmm25, zmm20
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | cmp rsi, rdi
 | 
			
		||||
|   0*F    |             |      |             |             |      |      |      |      | jl 0xfffffffffffffc6f
 | 
			
		||||
Total Num Of Uops: 187
 | 
			
		||||
Analysis Notes:
 | 
			
		||||
Backend allocation was stalled due to unavailable allocation resources.
 | 
			
		||||
@@ -1,152 +0,0 @@
 | 
			
		||||
Intel(R) Architecture Code Analyzer Version -  v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
 | 
			
		||||
Analyzed File -  gromacs-icc-avx512-sp.o
 | 
			
		||||
Binary Format - 64Bit
 | 
			
		||||
Architecture  -  SKX
 | 
			
		||||
Analysis Type - Throughput
 | 
			
		||||
 | 
			
		||||
Throughput Analysis Report
 | 
			
		||||
--------------------------
 | 
			
		||||
Block Throughput: 51.00 Cycles       Throughput Bottleneck: Backend
 | 
			
		||||
Loop Count:  22
 | 
			
		||||
Port Binding In Cycles Per Iteration:
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
|  Port  |   0   -  DV   |   1   |   2   -  D    |   3   -  D    |   4   |   5   |   6   |   7   |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
| Cycles | 47.5     0.0  |  9.0  | 11.0    11.0  | 11.0     8.0  |  3.0  | 47.5  |  9.0  |  0.0  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
DV - Divider pipe (on port 0)
 | 
			
		||||
D - Data fetch pipe (on ports 2 and 3)
 | 
			
		||||
F - Macro Fusion with the previous instruction occurred
 | 
			
		||||
* - instruction micro-ops not bound to a port
 | 
			
		||||
^ - Micro Fusion occurred
 | 
			
		||||
# - ESP Tracking sync uop was issued
 | 
			
		||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
 | 
			
		||||
X - instruction not supported, was not accounted in Analysis
 | 
			
		||||
 | 
			
		||||
| Num Of   |                    Ports pressure in cycles                         |      |
 | 
			
		||||
|  Uops    |  0  - DV    |  1   |  2  -  D    |  3  -  D    |  4   |  5   |  6   |  7   |
 | 
			
		||||
-----------------------------------------------------------------------------------------
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | mov edi, dword ptr [rcx+rax*4]
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r12d, r13d
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | movsxd rdi, edi
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | inc rax
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm10, zmmword ptr [rsp+0x140]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | test edi, 0x7fffffff
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm11, zmmword ptr [rsp+0x100]
 | 
			
		||||
|   1      |             |      |             | 1.0     1.0 |      |      |      |      | vmovups zmm9, zmmword ptr [rsp+0xc0]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz r12b
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea r14, ptr [rdi+rdi*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl r14, 0x5
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r8d, r12d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | neg r8d
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r11d, r12d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add r8d, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k0, r8d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea r9d, ptr [r12+r12*2]
 | 
			
		||||
|   2      | 1.0         |      | 1.0     1.0 |             |      |      |      |      | vsubps zmm3, zmm13, zmmword ptr [r14+rbx*1+0x40]
 | 
			
		||||
|   2      | 0.5         |      |             | 1.0     1.0 |      | 0.5  |      |      | vsubps zmm4, zmm10, zmmword ptr [r14+rbx*1]
 | 
			
		||||
|   2      | 0.5         |      | 1.0     1.0 |             |      | 0.5  |      |      | vsubps zmm10, zmm11, zmmword ptr [r14+rbx*1+0x40]
 | 
			
		||||
|   2      | 0.5         |      |             | 1.0     1.0 |      | 0.5  |      |      | vsubps zmm5, zmm17, zmmword ptr [r14+rbx*1+0x20]
 | 
			
		||||
|   2      | 0.5         |      | 1.0     1.0 |             |      | 0.5  |      |      | vsubps zmm27, zmm19, zmmword ptr [r14+rbx*1]
 | 
			
		||||
|   2      | 0.5         |      |             | 1.0     1.0 |      | 0.5  |      |      | vsubps zmm8, zmm15, zmmword ptr [r14+rbx*1+0x20]
 | 
			
		||||
|   2      | 0.5         |      | 1.0     1.0 |             |      | 0.5  |      |      | vsubps zmm11, zmm0, zmmword ptr [r14+rbx*1+0x40]
 | 
			
		||||
|   2      | 0.5         |      |             | 1.0     1.0 |      | 0.5  |      |      | vsubps zmm7, zmm9, zmmword ptr [r14+rbx*1]
 | 
			
		||||
|   2      | 0.5         |      | 1.0     1.0 |             |      | 0.5  |      |      | vsubps zmm9, zmm14, zmmword ptr [r14+rbx*1+0x20]
 | 
			
		||||
|   2      | 0.5         |      |             | 1.0     1.0 |      | 0.5  |      |      | vsubps zmm29, zmm12, zmmword ptr [r14+rbx*1+0x40]
 | 
			
		||||
|   2      | 0.5         |      | 1.0     1.0 |             |      | 0.5  |      |      | vsubps zmm28, zmm16, zmmword ptr [r14+rbx*1+0x20]
 | 
			
		||||
|   2      | 0.5         |      |             | 1.0     1.0 |      | 0.5  |      |      | vsubps zmm25, zmm18, zmmword ptr [r14+rbx*1]
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm2, zmm3, zmm3
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm30, zmm10, zmm10
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm1, zmm11, zmm11
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm26, zmm29, zmm29
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm2, zmm5, zmm5
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm30, zmm8, zmm8
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm1, zmm9, zmm9
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm26, zmm28, zmm28
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm2, zmm27, zmm27
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm30, zmm4, zmm4
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm1, zmm7, zmm7
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm26, zmm25, zmm25
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm31, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k7, zmm30, zmm24, 0x11
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm6, zmm30
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k3, zmm2, zmm24, 0x11
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm2, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k5, zmm26, zmm24, 0x11
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm26, zmm26
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm30, zmm31, zmm23
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandw k2, k0, k3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k3, zmm1, zmm24, 0x11
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm1, zmm31, zmm30
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm30, zmm31, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | neg r9d
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmsub213ps zmm1, zmm31, zmm20
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm31, zmm31, zmm22
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm31, zmm1, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | add r9d, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k4, r9d
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm30, zmm30, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandw k1, k4, k5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm1, zmm21, zmm30
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm30, zmm26, zmm23
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm31, zmm26, zmm30
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea r10d, ptr [r12*8]
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm30, zmm26, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | neg r10d
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmsub213ps zmm31, zmm26, zmm20
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm26, zmm26, zmm22
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm31, zmm31, zmm26
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add r10d, r12d
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm30, zmm30, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | add r10d, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k6, r10d
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm26, zmm21, zmm30
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandw k4, k6, k7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm25{k1}{z}, zmm25, zmm26
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm31{k1}{z}, zmm28, zmm26
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm28, zmm6, zmm23
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm30{k1}{z}, zmm29, zmm26
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm29, zmm2, zmm23
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm25{k2}, zmm27, zmm1
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm31{k2}, zmm5, zmm1
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm27, zmm6, zmm28
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm30{k2}, zmm3, zmm1
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm1, zmm2, zmm29
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm5, zmm6, zmm27
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmsub213ps zmm27, zmm6, zmm20
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm6, zmm6, zmm22
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm3, zmm2, zmm1
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmsub213ps zmm1, zmm2, zmm20
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm2, zmm2, zmm22
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm26, zmm27, zmm6
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm1, zmm1, zmm2
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm5, zmm5, zmm26
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm3, zmm3, zmm1
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm6, zmm21, zmm5
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulps zmm27, zmm21, zmm3
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm25{k4}, zmm4, zmm6
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm4, zmmword ptr [r14+rsi*1]
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm31{k4}, zmm8, zmm6
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm30{k4}, zmm10, zmm6
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl r11d, 0x4
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub r12d, r11d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add r12d, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovw k0, r12d
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kandw k5, k0, k3
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm25{k5}, zmm7, zmm27
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm31{k5}, zmm9, zmm27
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231ps zmm30{k5}, zmm11, zmm27
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vsubps zmm7, zmm4, zmm25
 | 
			
		||||
|   2      |             |      |             | 1.0         | 1.0  |      |      |      | vmovups zmmword ptr [r14+rsi*1], zmm7
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm8, zmmword ptr [r14+rsi*1+0x20]
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vsubps zmm4, zmm8, zmm31
 | 
			
		||||
|   2      |             |      |             | 1.0         | 1.0  |      |      |      | vmovups zmmword ptr [r14+rsi*1+0x20], zmm4
 | 
			
		||||
|   1      |             |      | 1.0     1.0 |             |      |      |      |      | vmovups zmm1, zmmword ptr [r14+rsi*1+0x40]
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vsubps zmm2, zmm1, zmm30
 | 
			
		||||
|   2      |             |      |             | 1.0         | 1.0  |      |      |      | vmovups zmmword ptr [r14+rsi*1+0x40], zmm2
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | cmp rax, rdx
 | 
			
		||||
|   0*F    |             |      |             |             |      |      |      |      | jb 0xfffffffffffffd30
 | 
			
		||||
Total Num Of Uops: 142
 | 
			
		||||
Analysis Notes:
 | 
			
		||||
Backend allocation was stalled due to unavailable allocation resources.
 | 
			
		||||
@@ -1,154 +0,0 @@
 | 
			
		||||
Intel(R) Architecture Code Analyzer Version -  v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
 | 
			
		||||
Analyzed File -  gromacs-icx-avx512-dp.o
 | 
			
		||||
Binary Format - 64Bit
 | 
			
		||||
Architecture  -  SKX
 | 
			
		||||
Analysis Type - Throughput
 | 
			
		||||
 | 
			
		||||
Throughput Analysis Report
 | 
			
		||||
--------------------------
 | 
			
		||||
Block Throughput: 49.26 Cycles       Throughput Bottleneck: Backend
 | 
			
		||||
Loop Count:  22
 | 
			
		||||
Port Binding In Cycles Per Iteration:
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
|  Port  |   0   -  DV   |   1   |   2   -  D    |   3   -  D    |   4   |   5   |   6   |   7   |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
| Cycles | 44.0     0.0  | 13.5  |  5.5     5.5  |  5.5     5.5  |  0.0  | 44.0  | 13.5  |  0.0  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
DV - Divider pipe (on port 0)
 | 
			
		||||
D - Data fetch pipe (on ports 2 and 3)
 | 
			
		||||
F - Macro Fusion with the previous instruction occurred
 | 
			
		||||
* - instruction micro-ops not bound to a port
 | 
			
		||||
^ - Micro Fusion occurred
 | 
			
		||||
# - ESP Tracking sync uop was issued
 | 
			
		||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
 | 
			
		||||
X - instruction not supported, was not accounted in Analysis
 | 
			
		||||
 | 
			
		||||
| Num Of   |                    Ports pressure in cycles                         |      |
 | 
			
		||||
|  Uops    |  0  - DV    |  1   |  2  -  D    |  3  -  D    |  4   |  5   |  6   |  7   |
 | 
			
		||||
-----------------------------------------------------------------------------------------
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | movsxd rcx, dword ptr [r10+rbx*4]
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea rdx, ptr [rcx+rcx*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl rdx, 0x6
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm28, zmmword ptr [rsi+rdx*1]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm29, zmmword ptr [rsi+rdx*1+0x40]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm30, zmmword ptr [rsi+rdx*1+0x80]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm3, zmmword ptr [rsp+0x10]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm3, zmm3, zmm28
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm31, zmm24, zmm30
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm16, zmmword ptr [rsp+0x150]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm16, zmm16, zmm29
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm17, zmm31, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm17, zmm16, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm17, zmm3, zmm3
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm18, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm21, zmm18
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm19, zmm18, zmm19
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm19, zmm18, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddpd zmm20, zmm19, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm18, zmm22, zmm18
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm18, zmm18, zmm20
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm20, zmm25, zmm30
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea edx, ptr [rcx+rcx*1]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | cmp r11, rdx
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setnz dl
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz al
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add ecx, ecx
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | inc ecx
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | cmp r11, rcx
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz cl
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm18, zmm19, zmm18
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm19, zmmword ptr [rsp+0x210]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm19, zmm19, zmm28
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setnz dil
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov ebp, edi
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl bpl, 0x4
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | sub bpl, al
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add bpl, 0xef
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, ebp
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm17, zmm0, 0x1
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm17, zmmword ptr [rsp+0x110]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm17, zmm17, zmm29
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea eax, ptr [rdx+rdx*1]
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov ebp, edi
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm18, zmm18, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm14{k1}, zmm3, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm20, zmm20
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm3, zmm17, zmm17
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm3, zmm19, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm11{k1}, zmm16, zmm18
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm16, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm7{k1}, zmm31, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm18, zmm21, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm18, zmm16, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm18, zmm16, zmm18
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddpd zmm31, zmm18, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm22, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm16, zmm16, zmm31
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm31, zmmword ptr [rsp+0x1d0]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm31, zmm31, zmm28
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl bpl, 0x5
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | or bpl, al
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | or bpl, 0xdd
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, ebp
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm3, zmm0, 0x1
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm3, zmmword ptr [rsp+0xd0]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm3, zmm3, zmm29
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm18, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm18, zmm26, zmm30
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm16, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm15{k1}, zmm19, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm19, zmm18, zmm18
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm19, zmm3, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm19, zmm31, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm10{k1}, zmm17, zmm16
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm17, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm6{k1}, zmm20, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm21, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm16, zmm17, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm17, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddpd zmm20, zmm16, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm17, zmm22, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm17, zmm17, zmm20
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm16, zmm17
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea eax, ptr [rdx*4]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl dil, 0x6
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | or dil, al
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | or dil, 0xbb
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, edi
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm19, zmm0, 0x1
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovupd zmm17, zmmword ptr [rsp+0x190]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm17, zmm17, zmm28
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubpd zmm19, zmm23, zmm29
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubpd zmm20, zmm27, zmm30
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm16, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm13{k1}, zmm31, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm28, zmm20, zmm20
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm28, zmm19, zmm19
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm28, zmm17, zmm17
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm9{k1}, zmm3, zmm16
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm3, zmm28
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm5{k1}, zmm18, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm21, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm16, zmm3, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm16, zmm3, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddpd zmm18, zmm16, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm22, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm3, zmm3, zmm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm16, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl dl, 0x3
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl cl, 0x7
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | or cl, dl
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | add cl, 0xf7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, ecx
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k1{k1}, zmm28, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulpd zmm3, zmm3, zmm2
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm12{k1}, zmm17, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231pd zmm8{k1}, zmm19, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231pd zmm4{k1}, zmm20, zmm3
 | 
			
		||||
|   1      |             | 0.5  |             |             |      |      | 0.5  |      | inc rbx
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | cmp r9, rbx
 | 
			
		||||
|   0*F    |             |      |             |             |      |      |      |      | jnz 0xfffffffffffffd5a
 | 
			
		||||
Total Num Of Uops: 129
 | 
			
		||||
Analysis Notes:
 | 
			
		||||
Backend allocation was stalled due to unavailable allocation resources.
 | 
			
		||||
@@ -1,288 +0,0 @@
 | 
			
		||||
 | 
			
		||||
[0] Code Region
 | 
			
		||||
 | 
			
		||||
Iterations:        100
 | 
			
		||||
Instructions:      12200
 | 
			
		||||
Total Cycles:      4745
 | 
			
		||||
Total uOps:        14000
 | 
			
		||||
 | 
			
		||||
Dispatch Width:    6
 | 
			
		||||
uOps Per Cycle:    2.95
 | 
			
		||||
IPC:               2.57
 | 
			
		||||
Block RThroughput: 34.0
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Instruction Info:
 | 
			
		||||
[1]: #uOps
 | 
			
		||||
[2]: Latency
 | 
			
		||||
[3]: RThroughput
 | 
			
		||||
[4]: MayLoad
 | 
			
		||||
[5]: MayStore
 | 
			
		||||
[6]: HasSideEffects (U)
 | 
			
		||||
 | 
			
		||||
[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 | 
			
		||||
 1      5     0.50    *                   movslq	(%r10,%rbx,4), %rcx
 | 
			
		||||
 1      1     0.50                        leaq	(%rcx,%rcx,2), %rdx
 | 
			
		||||
 1      1     0.50                        shlq	$6, %rdx
 | 
			
		||||
 2      8     0.50    *                   vmovupd	(%rsi,%rdx), %zmm28
 | 
			
		||||
 2      8     0.50    *                   vmovupd	64(%rsi,%rdx), %zmm29
 | 
			
		||||
 2      8     0.50    *                   vmovupd	128(%rsi,%rdx), %zmm30
 | 
			
		||||
 2      8     0.50    *                   vmovupd	16(%rsp), %zmm3
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm28, %zmm3, %zmm3
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm30, %zmm24, %zmm31
 | 
			
		||||
 2      8     0.50    *                   vmovupd	336(%rsp), %zmm16
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm29, %zmm16, %zmm16
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm31, %zmm31, %zmm17
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm16, %zmm16, %zmm17
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm3, %zmm3, %zmm17
 | 
			
		||||
 3      4     2.00                        vrcp14pd	%zmm17, %zmm18
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm21, %zmm19
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm19, %zmm18, %zmm19
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm19, %zmm18, %zmm19
 | 
			
		||||
 1      4     0.50                        vaddpd	%zmm1, %zmm19, %zmm20
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm22, %zmm18
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm20, %zmm18, %zmm18
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm30, %zmm25, %zmm20
 | 
			
		||||
 1      1     0.50                        leal	(%rcx,%rcx), %edx
 | 
			
		||||
 1      1     0.25                        cmpq	%rdx, %r11
 | 
			
		||||
 1      1     0.50                        setne	%dl
 | 
			
		||||
 1      1     0.50                        sete	%al
 | 
			
		||||
 1      1     0.25                        addl	%ecx, %ecx
 | 
			
		||||
 1      1     0.25                        incl	%ecx
 | 
			
		||||
 1      1     0.25                        cmpq	%rcx, %r11
 | 
			
		||||
 1      1     0.50                        sete	%cl
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm19, %zmm18
 | 
			
		||||
 2      8     0.50    *                   vmovupd	528(%rsp), %zmm19
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm28, %zmm19, %zmm19
 | 
			
		||||
 1      1     0.50                        setne	%dil
 | 
			
		||||
 1      1     0.25                        movl	%edi, %ebp
 | 
			
		||||
 1      1     0.50                        shlb	$4, %bpl
 | 
			
		||||
 1      1     0.25                        subb	%al, %bpl
 | 
			
		||||
 1      1     0.25                        addb	$-17, %bpl
 | 
			
		||||
 1      1     1.00                        kmovd	%ebp, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltpd	%zmm0, %zmm17, %k1 {%k1}
 | 
			
		||||
 2      8     0.50    *                   vmovupd	272(%rsp), %zmm17
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm29, %zmm17, %zmm17
 | 
			
		||||
 1      1     0.50                        leal	(%rdx,%rdx), %eax
 | 
			
		||||
 1      1     0.25                        movl	%edi, %ebp
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm2, %zmm18, %zmm18
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm18, %zmm3, %zmm14 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm20, %zmm20, %zmm3
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm17, %zmm17, %zmm3
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm19, %zmm19, %zmm3
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm18, %zmm16, %zmm11 {%k1}
 | 
			
		||||
 3      4     2.00                        vrcp14pd	%zmm3, %zmm16
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm18, %zmm31, %zmm7 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm16, %zmm21, %zmm18
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm16, %zmm18
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm16, %zmm18
 | 
			
		||||
 1      4     0.50                        vaddpd	%zmm1, %zmm18, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm16, %zmm22, %zmm16
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm31, %zmm16, %zmm16
 | 
			
		||||
 2      8     0.50    *                   vmovupd	464(%rsp), %zmm31
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm28, %zmm31, %zmm31
 | 
			
		||||
 1      1     0.50                        shlb	$5, %bpl
 | 
			
		||||
 1      1     0.25                        orb	%al, %bpl
 | 
			
		||||
 1      1     0.25                        orb	$-35, %bpl
 | 
			
		||||
 1      1     1.00                        kmovd	%ebp, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltpd	%zmm0, %zmm3, %k1 {%k1}
 | 
			
		||||
 2      8     0.50    *                   vmovupd	208(%rsp), %zmm3
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm29, %zmm3, %zmm3
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm16, %zmm18, %zmm16
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm30, %zmm26, %zmm18
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm2, %zmm16, %zmm16
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm16, %zmm19, %zmm15 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm18, %zmm19
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm3, %zmm3, %zmm19
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm31, %zmm31, %zmm19
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm16, %zmm17, %zmm10 {%k1}
 | 
			
		||||
 3      4     2.00                        vrcp14pd	%zmm19, %zmm17
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm16, %zmm20, %zmm6 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm17, %zmm21, %zmm16
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm16, %zmm17, %zmm16
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm16, %zmm17, %zmm16
 | 
			
		||||
 1      4     0.50                        vaddpd	%zmm1, %zmm16, %zmm20
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm17, %zmm22, %zmm17
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm20, %zmm17, %zmm17
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm17, %zmm16, %zmm16
 | 
			
		||||
 1      1     0.50                        leal	(,%rdx,4), %eax
 | 
			
		||||
 1      1     0.50                        shlb	$6, %dil
 | 
			
		||||
 1      1     0.25                        orb	%al, %dil
 | 
			
		||||
 1      1     0.25                        orb	$-69, %dil
 | 
			
		||||
 1      1     1.00                        kmovd	%edi, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltpd	%zmm0, %zmm19, %k1 {%k1}
 | 
			
		||||
 2      8     0.50    *                   vmovupd	400(%rsp), %zmm17
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm28, %zmm17, %zmm17
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm29, %zmm23, %zmm19
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm30, %zmm27, %zmm20
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm2, %zmm16, %zmm16
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm16, %zmm31, %zmm13 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm20, %zmm20, %zmm28
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm19, %zmm19, %zmm28
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm17, %zmm17, %zmm28
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm16, %zmm3, %zmm9 {%k1}
 | 
			
		||||
 3      4     2.00                        vrcp14pd	%zmm28, %zmm3
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm16, %zmm18, %zmm5 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm3, %zmm21, %zmm16
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm16, %zmm3, %zmm16
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm16, %zmm3, %zmm16
 | 
			
		||||
 1      4     0.50                        vaddpd	%zmm1, %zmm16, %zmm18
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm3, %zmm22, %zmm3
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm3, %zmm3
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm3, %zmm16, %zmm3
 | 
			
		||||
 1      1     0.50                        shlb	$3, %dl
 | 
			
		||||
 1      1     0.50                        shlb	$7, %cl
 | 
			
		||||
 1      1     0.25                        orb	%dl, %cl
 | 
			
		||||
 1      1     0.25                        addb	$-9, %cl
 | 
			
		||||
 1      1     1.00                        kmovd	%ecx, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltpd	%zmm0, %zmm28, %k1 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm2, %zmm3, %zmm3
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm3, %zmm17, %zmm12 {%k1}
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm3, %zmm19, %zmm8 {%k1}
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm3, %zmm20, %zmm4 {%k1}
 | 
			
		||||
 1      1     0.25                        incq	%rbx
 | 
			
		||||
 1      1     0.25                        cmpq	%rbx, %r9
 | 
			
		||||
 1      1     0.50                        jne	.LBB5_12
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resources:
 | 
			
		||||
[0]   - SKXDivider
 | 
			
		||||
[1]   - SKXFPDivider
 | 
			
		||||
[2]   - SKXPort0
 | 
			
		||||
[3]   - SKXPort1
 | 
			
		||||
[4]   - SKXPort2
 | 
			
		||||
[5]   - SKXPort3
 | 
			
		||||
[6]   - SKXPort4
 | 
			
		||||
[7]   - SKXPort5
 | 
			
		||||
[8]   - SKXPort6
 | 
			
		||||
[9]   - SKXPort7
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resource pressure per iteration:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    
 | 
			
		||||
 -      -     45.53  20.45  5.50   5.50    -     44.64  18.38   -     
 | 
			
		||||
 | 
			
		||||
Resource pressure by instruction:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
 | 
			
		||||
 -      -      -      -     0.50   0.50    -      -      -      -     movslq	(%r10,%rbx,4), %rcx
 | 
			
		||||
 -      -      -     0.99    -      -      -     0.01    -      -     leaq	(%rcx,%rcx,2), %rdx
 | 
			
		||||
 -      -     0.01    -      -      -      -      -     0.99    -     shlq	$6, %rdx
 | 
			
		||||
 -      -     0.01   0.99   0.49   0.51    -      -      -      -     vmovupd	(%rsi,%rdx), %zmm28
 | 
			
		||||
 -      -     0.01   0.91   0.51   0.49    -     0.08    -      -     vmovupd	64(%rsi,%rdx), %zmm29
 | 
			
		||||
 -      -     0.01   0.56   0.49   0.51    -     0.43    -      -     vmovupd	128(%rsi,%rdx), %zmm30
 | 
			
		||||
 -      -      -     0.99   0.50   0.50    -     0.01    -      -     vmovupd	16(%rsp), %zmm3
 | 
			
		||||
 -      -     0.95    -      -      -      -     0.05    -      -     vsubpd	%zmm28, %zmm3, %zmm3
 | 
			
		||||
 -      -     0.48    -      -      -      -     0.52    -      -     vsubpd	%zmm30, %zmm24, %zmm31
 | 
			
		||||
 -      -      -     1.00   0.50   0.50    -      -      -      -     vmovupd	336(%rsp), %zmm16
 | 
			
		||||
 -      -     0.49    -      -      -      -     0.51    -      -     vsubpd	%zmm29, %zmm16, %zmm16
 | 
			
		||||
 -      -     0.48    -      -      -      -     0.52    -      -     vmulpd	%zmm31, %zmm31, %zmm17
 | 
			
		||||
 -      -     0.49    -      -      -      -     0.51    -      -     vfmadd231pd	%zmm16, %zmm16, %zmm17
 | 
			
		||||
 -      -     0.04    -      -      -      -     0.96    -      -     vfmadd231pd	%zmm3, %zmm3, %zmm17
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14pd	%zmm17, %zmm18
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulpd	%zmm18, %zmm21, %zmm19
 | 
			
		||||
 -      -     0.51    -      -      -      -     0.49    -      -     vmulpd	%zmm19, %zmm18, %zmm19
 | 
			
		||||
 -      -     0.49    -      -      -      -     0.51    -      -     vmulpd	%zmm19, %zmm18, %zmm19
 | 
			
		||||
 -      -     0.07    -      -      -      -     0.93    -      -     vaddpd	%zmm1, %zmm19, %zmm20
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulpd	%zmm18, %zmm22, %zmm18
 | 
			
		||||
 -      -     0.95    -      -      -      -     0.05    -      -     vmulpd	%zmm20, %zmm18, %zmm18
 | 
			
		||||
 -      -     0.92    -      -      -      -     0.08    -      -     vsubpd	%zmm30, %zmm25, %zmm20
 | 
			
		||||
 -      -      -     0.94    -      -      -     0.06    -      -     leal	(%rcx,%rcx), %edx
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     cmpq	%rdx, %r11
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     setne	%dl
 | 
			
		||||
 -      -     0.44    -      -      -      -      -     0.56    -     sete	%al
 | 
			
		||||
 -      -      -     0.07    -      -      -     0.02   0.91    -     addl	%ecx, %ecx
 | 
			
		||||
 -      -      -     0.53    -      -      -     0.46   0.01    -     incl	%ecx
 | 
			
		||||
 -      -      -     0.51    -      -      -     0.46   0.03    -     cmpq	%rcx, %r11
 | 
			
		||||
 -      -     0.02    -      -      -      -      -     0.98    -     sete	%cl
 | 
			
		||||
 -      -     0.94    -      -      -      -     0.06    -      -     vmulpd	%zmm18, %zmm19, %zmm18
 | 
			
		||||
 -      -     0.01   0.99   0.51   0.49    -      -      -      -     vmovupd	528(%rsp), %zmm19
 | 
			
		||||
 -      -     0.47    -      -      -      -     0.53    -      -     vsubpd	%zmm28, %zmm19, %zmm19
 | 
			
		||||
 -      -     0.04    -      -      -      -      -     0.96    -     setne	%dil
 | 
			
		||||
 -      -      -     0.95    -      -      -     0.02   0.03    -     movl	%edi, %ebp
 | 
			
		||||
 -      -     0.01    -      -      -      -      -     0.99    -     shlb	$4, %bpl
 | 
			
		||||
 -      -      -     0.96    -      -      -      -     0.04    -     subb	%al, %bpl
 | 
			
		||||
 -      -      -     0.06    -      -      -      -     0.94    -     addb	$-17, %bpl
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%ebp, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltpd	%zmm0, %zmm17, %k1 {%k1}
 | 
			
		||||
 -      -     0.02   0.97   0.50   0.50    -     0.01    -      -     vmovupd	272(%rsp), %zmm17
 | 
			
		||||
 -      -     0.96    -      -      -      -     0.04    -      -     vsubpd	%zmm29, %zmm17, %zmm17
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     leal	(%rdx,%rdx), %eax
 | 
			
		||||
 -      -      -     0.05    -      -      -      -     0.95    -     movl	%edi, %ebp
 | 
			
		||||
 -      -     0.51    -      -      -      -     0.49    -      -     vmulpd	%zmm2, %zmm18, %zmm18
 | 
			
		||||
 -      -     0.53    -      -      -      -     0.47    -      -     vfmadd231pd	%zmm18, %zmm3, %zmm14 {%k1}
 | 
			
		||||
 -      -     0.45    -      -      -      -     0.55    -      -     vmulpd	%zmm20, %zmm20, %zmm3
 | 
			
		||||
 -      -     0.94    -      -      -      -     0.06    -      -     vfmadd231pd	%zmm17, %zmm17, %zmm3
 | 
			
		||||
 -      -     0.03    -      -      -      -     0.97    -      -     vfmadd231pd	%zmm19, %zmm19, %zmm3
 | 
			
		||||
 -      -     0.47    -      -      -      -     0.53    -      -     vfmadd231pd	%zmm18, %zmm16, %zmm11 {%k1}
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14pd	%zmm3, %zmm16
 | 
			
		||||
 -      -     0.53    -      -      -      -     0.47    -      -     vfmadd231pd	%zmm18, %zmm31, %zmm7 {%k1}
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulpd	%zmm16, %zmm21, %zmm18
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulpd	%zmm18, %zmm16, %zmm18
 | 
			
		||||
 -      -     0.97    -      -      -      -     0.03    -      -     vmulpd	%zmm18, %zmm16, %zmm18
 | 
			
		||||
 -      -     0.52    -      -      -      -     0.48    -      -     vaddpd	%zmm1, %zmm18, %zmm31
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vmulpd	%zmm16, %zmm22, %zmm16
 | 
			
		||||
 -      -     0.52    -      -      -      -     0.48    -      -     vmulpd	%zmm31, %zmm16, %zmm16
 | 
			
		||||
 -      -      -     0.99   0.50   0.50    -     0.01    -      -     vmovupd	464(%rsp), %zmm31
 | 
			
		||||
 -      -     0.03    -      -      -      -     0.97    -      -     vsubpd	%zmm28, %zmm31, %zmm31
 | 
			
		||||
 -      -     0.01    -      -      -      -      -     0.99    -     shlb	$5, %bpl
 | 
			
		||||
 -      -      -     0.94    -      -      -      -     0.06    -     orb	%al, %bpl
 | 
			
		||||
 -      -      -     0.04    -      -      -      -     0.96    -     orb	$-35, %bpl
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%ebp, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltpd	%zmm0, %zmm3, %k1 {%k1}
 | 
			
		||||
 -      -      -     0.99   0.50   0.50    -     0.01    -      -     vmovupd	208(%rsp), %zmm3
 | 
			
		||||
 -      -     0.95    -      -      -      -     0.05    -      -     vsubpd	%zmm29, %zmm3, %zmm3
 | 
			
		||||
 -      -     0.51    -      -      -      -     0.49    -      -     vmulpd	%zmm16, %zmm18, %zmm16
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vsubpd	%zmm30, %zmm26, %zmm18
 | 
			
		||||
 -      -     0.52    -      -      -      -     0.48    -      -     vmulpd	%zmm2, %zmm16, %zmm16
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vfmadd231pd	%zmm16, %zmm19, %zmm15 {%k1}
 | 
			
		||||
 -      -     0.03    -      -      -      -     0.97    -      -     vmulpd	%zmm18, %zmm18, %zmm19
 | 
			
		||||
 -      -     0.06    -      -      -      -     0.94    -      -     vfmadd231pd	%zmm3, %zmm3, %zmm19
 | 
			
		||||
 -      -     0.50    -      -      -      -     0.50    -      -     vfmadd231pd	%zmm31, %zmm31, %zmm19
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vfmadd231pd	%zmm16, %zmm17, %zmm10 {%k1}
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14pd	%zmm19, %zmm17
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vfmadd231pd	%zmm16, %zmm20, %zmm6 {%k1}
 | 
			
		||||
 -      -     0.07    -      -      -      -     0.93    -      -     vmulpd	%zmm17, %zmm21, %zmm16
 | 
			
		||||
 -      -     0.50    -      -      -      -     0.50    -      -     vmulpd	%zmm16, %zmm17, %zmm16
 | 
			
		||||
 -      -     0.09    -      -      -      -     0.91    -      -     vmulpd	%zmm16, %zmm17, %zmm16
 | 
			
		||||
 -      -     0.07    -      -      -      -     0.93    -      -     vaddpd	%zmm1, %zmm16, %zmm20
 | 
			
		||||
 -      -     0.93    -      -      -      -     0.07    -      -     vmulpd	%zmm17, %zmm22, %zmm17
 | 
			
		||||
 -      -     0.94    -      -      -      -     0.06    -      -     vmulpd	%zmm20, %zmm17, %zmm17
 | 
			
		||||
 -      -     0.51    -      -      -      -     0.49    -      -     vmulpd	%zmm17, %zmm16, %zmm16
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     leal	(,%rdx,4), %eax
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     shlb	$6, %dil
 | 
			
		||||
 -      -      -     0.02    -      -      -      -     0.98    -     orb	%al, %dil
 | 
			
		||||
 -      -      -     0.48    -      -      -      -     0.52    -     orb	$-69, %dil
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%edi, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltpd	%zmm0, %zmm19, %k1 {%k1}
 | 
			
		||||
 -      -      -     1.00   0.50   0.50    -      -      -      -     vmovupd	400(%rsp), %zmm17
 | 
			
		||||
 -      -     0.49    -      -      -      -     0.51    -      -     vsubpd	%zmm28, %zmm17, %zmm17
 | 
			
		||||
 -      -     0.49    -      -      -      -     0.51    -      -     vsubpd	%zmm29, %zmm23, %zmm19
 | 
			
		||||
 -      -     0.02    -      -      -      -     0.98    -      -     vsubpd	%zmm30, %zmm27, %zmm20
 | 
			
		||||
 -      -     0.98    -      -      -      -     0.02    -      -     vmulpd	%zmm2, %zmm16, %zmm16
 | 
			
		||||
 -      -     0.50    -      -      -      -     0.50    -      -     vfmadd231pd	%zmm16, %zmm31, %zmm13 {%k1}
 | 
			
		||||
 -      -     0.94    -      -      -      -     0.06    -      -     vmulpd	%zmm20, %zmm20, %zmm28
 | 
			
		||||
 -      -     0.04    -      -      -      -     0.96    -      -     vfmadd231pd	%zmm19, %zmm19, %zmm28
 | 
			
		||||
 -      -     0.07    -      -      -      -     0.93    -      -     vfmadd231pd	%zmm17, %zmm17, %zmm28
 | 
			
		||||
 -      -     0.50    -      -      -      -     0.50    -      -     vfmadd231pd	%zmm16, %zmm3, %zmm9 {%k1}
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14pd	%zmm28, %zmm3
 | 
			
		||||
 -      -     0.50    -      -      -      -     0.50    -      -     vfmadd231pd	%zmm16, %zmm18, %zmm5 {%k1}
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulpd	%zmm3, %zmm21, %zmm16
 | 
			
		||||
 -      -     0.55    -      -      -      -     0.45    -      -     vmulpd	%zmm16, %zmm3, %zmm16
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulpd	%zmm16, %zmm3, %zmm16
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vaddpd	%zmm1, %zmm16, %zmm18
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulpd	%zmm3, %zmm22, %zmm3
 | 
			
		||||
 -      -     0.52    -      -      -      -     0.48    -      -     vmulpd	%zmm18, %zmm3, %zmm3
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulpd	%zmm3, %zmm16, %zmm3
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     shlb	$3, %dl
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     shlb	$7, %cl
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     orb	%dl, %cl
 | 
			
		||||
 -      -      -     0.52    -      -      -      -     0.48    -     addb	$-9, %cl
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%ecx, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltpd	%zmm0, %zmm28, %k1 {%k1}
 | 
			
		||||
 -      -     0.98    -      -      -      -     0.02    -      -     vmulpd	%zmm2, %zmm3, %zmm3
 | 
			
		||||
 -      -     0.97    -      -      -      -     0.03    -      -     vfmadd231pd	%zmm3, %zmm17, %zmm12 {%k1}
 | 
			
		||||
 -      -     0.03    -      -      -      -     0.97    -      -     vfmadd231pd	%zmm3, %zmm19, %zmm8 {%k1}
 | 
			
		||||
 -      -     0.97    -      -      -      -     0.03    -      -     vfmadd231pd	%zmm3, %zmm20, %zmm4 {%k1}
 | 
			
		||||
 -      -      -     0.48    -      -      -      -     0.52    -     incq	%rbx
 | 
			
		||||
 -      -      -     0.52    -      -      -      -     0.48    -     cmpq	%rbx, %r9
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     jne	.LBB5_12
 | 
			
		||||
@@ -1,167 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      gromacs-icx-avx512-dp.s
 | 
			
		||||
Architecture:       ICX
 | 
			
		||||
Timestamp:          2023-02-14 12:51:57
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                                 Port pressure in cycles                                                  
 | 
			
		||||
     |  0   - 0DV  |  1   - 1DV  |  2   -  2D  |  3   -  3D  |  4   |   5   |  6   |  7   |  8   |  9   ||  CP  | LCD  |
 | 
			
		||||
------------------------------------------------------------------------------------------------------------------------
 | 
			
		||||
2241 |             |             |             |             |      |       |      |      |      |      ||      |      |   # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
 | 
			
		||||
2242 |             |             |             |             |      |       |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
2243 |             |             |             |             |      |       |      |      |      |      ||      |      |   .LBB5_12:                               #   Parent Loop BB5_7 Depth=1
 | 
			
		||||
2244 |             |             |             |             |      |       |      |      |      |      ||      |      |   # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
2245 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||  5.0 |      |   movslq (%r10,%rbx,4), %rcx
 | 
			
		||||
2246 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||  6.0 |      |   leaq (%rcx,%rcx,2), %rdx
 | 
			
		||||
2247 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||  1.0 |      |   shlq $6, %rdx
 | 
			
		||||
2248 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd (%rsi,%rdx), %zmm28             # AlignMOV convert to UnAlignMOV
 | 
			
		||||
2249 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 64(%rsi,%rdx), %zmm29           # AlignMOV convert to UnAlignMOV
 | 
			
		||||
2250 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||  5.0 |      |   vmovupd 128(%rsi,%rdx), %zmm30          # AlignMOV convert to UnAlignMOV
 | 
			
		||||
2251 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 16(%rsp), %zmm3                 # 64-byte Reload
 | 
			
		||||
2252 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm28, %zmm3, %zmm3
 | 
			
		||||
2253 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vsubpd %zmm30, %zmm24, %zmm31
 | 
			
		||||
2254 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 336(%rsp), %zmm16               # 64-byte Reload
 | 
			
		||||
2255 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm29, %zmm16, %zmm16
 | 
			
		||||
2256 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd %zmm31, %zmm31, %zmm17
 | 
			
		||||
2257 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231pd %zmm16, %zmm16, %zmm17  # zmm17 = (zmm16 * zmm16) + zmm17
 | 
			
		||||
2258 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231pd %zmm3, %zmm3, %zmm17    # zmm17 = (zmm3 * zmm3) + zmm17
 | 
			
		||||
2259 | 2.50        |             |             |             |      | 0.500 |      |      |      |      ||  6.0 |      |   vrcp14pd %zmm17, %zmm18
 | 
			
		||||
2260 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd %zmm18, %zmm21, %zmm19
 | 
			
		||||
2261 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd %zmm19, %zmm18, %zmm19
 | 
			
		||||
2262 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd %zmm19, %zmm18, %zmm19
 | 
			
		||||
2263 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vaddpd %zmm1, %zmm19, %zmm20
 | 
			
		||||
2264 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm18, %zmm22, %zmm18
 | 
			
		||||
2265 | 0.75        |             |             |             |      | 0.250 |      |      |      |      ||  4.0 |      |   vmulpd %zmm20, %zmm18, %zmm18
 | 
			
		||||
2266 | 1.00        |             |             |             |      | 0.000 |      |      |      |      ||      |      |   vsubpd %zmm30, %zmm25, %zmm20
 | 
			
		||||
2267 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   leal (%rcx,%rcx), %edx
 | 
			
		||||
2268 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   cmpq %rdx, %r11
 | 
			
		||||
2269 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   setne %dl
 | 
			
		||||
2270 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   sete %al
 | 
			
		||||
2271 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   addl %ecx, %ecx
 | 
			
		||||
2272 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   incl %ecx
 | 
			
		||||
2273 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   cmpq %rcx, %r11
 | 
			
		||||
2274 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   sete %cl
 | 
			
		||||
2275 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd %zmm18, %zmm19, %zmm18
 | 
			
		||||
2276 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 528(%rsp), %zmm19               # 64-byte Reload
 | 
			
		||||
2277 | 1.00        |             |             |             |      | 0.000 |      |      |      |      ||      |      |   vsubpd %zmm28, %zmm19, %zmm19
 | 
			
		||||
2278 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   setne %dil
 | 
			
		||||
2279 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   movl %edi, %ebp
 | 
			
		||||
2280 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |  1.0 |   shlb $4, %bpl
 | 
			
		||||
2281 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |  1.0 |   subb %al, %bpl
 | 
			
		||||
2282 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |  1.0 |   addb $-17, %bpl
 | 
			
		||||
2283 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovd %ebp, %k1
 | 
			
		||||
2284 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vcmpltpd %zmm0, %zmm17, %k1 {%k1}
 | 
			
		||||
2285 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 272(%rsp), %zmm17               # 64-byte Reload
 | 
			
		||||
2286 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm29, %zmm17, %zmm17
 | 
			
		||||
2287 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   leal (%rdx,%rdx), %eax
 | 
			
		||||
2288 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   movl %edi, %ebp
 | 
			
		||||
2289 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd %zmm2, %zmm18, %zmm18
 | 
			
		||||
2290 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
 | 
			
		||||
2291 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm20, %zmm20, %zmm3
 | 
			
		||||
2292 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm17, %zmm17, %zmm3   # zmm3 = (zmm17 * zmm17) + zmm3
 | 
			
		||||
2293 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm19, %zmm19, %zmm3   # zmm3 = (zmm19 * zmm19) + zmm3
 | 
			
		||||
2294 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
 | 
			
		||||
2295 | 2.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vrcp14pd %zmm3, %zmm16
 | 
			
		||||
2296 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
 | 
			
		||||
2297 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm16, %zmm21, %zmm18
 | 
			
		||||
2298 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm18, %zmm16, %zmm18
 | 
			
		||||
2299 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm18, %zmm16, %zmm18
 | 
			
		||||
2300 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vaddpd %zmm1, %zmm18, %zmm31
 | 
			
		||||
2301 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm16, %zmm22, %zmm16
 | 
			
		||||
2302 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm31, %zmm16, %zmm16
 | 
			
		||||
2303 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 464(%rsp), %zmm31               # 64-byte Reload
 | 
			
		||||
2304 | 0.75        |             |             |             |      | 0.250 |      |      |      |      ||      |      |   vsubpd %zmm28, %zmm31, %zmm31
 | 
			
		||||
2305 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |  1.0 |   shlb $5, %bpl
 | 
			
		||||
2306 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |  1.0 |   orb %al, %bpl
 | 
			
		||||
2307 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |  1.0 |   orb $-35, %bpl
 | 
			
		||||
2308 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovd %ebp, %k1
 | 
			
		||||
2309 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vcmpltpd %zmm0, %zmm3, %k1 {%k1}
 | 
			
		||||
2310 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 208(%rsp), %zmm3                # 64-byte Reload
 | 
			
		||||
2311 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm29, %zmm3, %zmm3
 | 
			
		||||
2312 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm16, %zmm18, %zmm16
 | 
			
		||||
2313 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm30, %zmm26, %zmm18
 | 
			
		||||
2314 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm2, %zmm16, %zmm16
 | 
			
		||||
2315 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
 | 
			
		||||
2316 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm18, %zmm18, %zmm19
 | 
			
		||||
2317 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm3, %zmm3, %zmm19    # zmm19 = (zmm3 * zmm3) + zmm19
 | 
			
		||||
2318 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm31, %zmm31, %zmm19  # zmm19 = (zmm31 * zmm31) + zmm19
 | 
			
		||||
2319 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
 | 
			
		||||
2320 | 2.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vrcp14pd %zmm19, %zmm17
 | 
			
		||||
2321 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
 | 
			
		||||
2322 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm17, %zmm21, %zmm16
 | 
			
		||||
2323 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm16, %zmm17, %zmm16
 | 
			
		||||
2324 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm16, %zmm17, %zmm16
 | 
			
		||||
2325 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vaddpd %zmm1, %zmm16, %zmm20
 | 
			
		||||
2326 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm17, %zmm22, %zmm17
 | 
			
		||||
2327 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm20, %zmm17, %zmm17
 | 
			
		||||
2328 | 0.75        |             |             |             |      | 0.250 |      |      |      |      ||      |      |   vmulpd %zmm17, %zmm16, %zmm16
 | 
			
		||||
2329 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   leal (,%rdx,4), %eax
 | 
			
		||||
2330 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shlb $6, %dil
 | 
			
		||||
2331 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   orb %al, %dil
 | 
			
		||||
2332 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   orb $-69, %dil
 | 
			
		||||
2333 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovd %edi, %k1
 | 
			
		||||
2334 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vcmpltpd %zmm0, %zmm19, %k1 {%k1}
 | 
			
		||||
2335 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd 400(%rsp), %zmm17               # 64-byte Reload
 | 
			
		||||
2336 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm28, %zmm17, %zmm17
 | 
			
		||||
2337 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm29, %zmm23, %zmm19
 | 
			
		||||
2338 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd %zmm30, %zmm27, %zmm20
 | 
			
		||||
2339 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm2, %zmm16, %zmm16
 | 
			
		||||
2340 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
 | 
			
		||||
2341 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd %zmm20, %zmm20, %zmm28
 | 
			
		||||
2342 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231pd %zmm19, %zmm19, %zmm28  # zmm28 = (zmm19 * zmm19) + zmm28
 | 
			
		||||
2343 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231pd %zmm17, %zmm17, %zmm28  # zmm28 = (zmm17 * zmm17) + zmm28
 | 
			
		||||
2344 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
 | 
			
		||||
2345 | 2.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vrcp14pd %zmm28, %zmm3
 | 
			
		||||
2346 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
 | 
			
		||||
2347 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulpd %zmm3, %zmm21, %zmm16
 | 
			
		||||
2348 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulpd %zmm16, %zmm3, %zmm16
 | 
			
		||||
2349 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulpd %zmm16, %zmm3, %zmm16
 | 
			
		||||
2350 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vaddpd %zmm1, %zmm16, %zmm18
 | 
			
		||||
2351 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulpd %zmm3, %zmm22, %zmm3
 | 
			
		||||
2352 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulpd %zmm18, %zmm3, %zmm3
 | 
			
		||||
2353 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulpd %zmm3, %zmm16, %zmm3
 | 
			
		||||
2354 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shlb $3, %dl
 | 
			
		||||
2355 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shlb $7, %cl
 | 
			
		||||
2356 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   orb %dl, %cl
 | 
			
		||||
2357 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   addb $-9, %cl
 | 
			
		||||
2358 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovd %ecx, %k1
 | 
			
		||||
2359 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vcmpltpd %zmm0, %zmm28, %k1 {%k1}
 | 
			
		||||
2360 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulpd %zmm2, %zmm3, %zmm3
 | 
			
		||||
2361 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
 | 
			
		||||
2362 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
 | 
			
		||||
2363 | 0.24        |             |             |             |      | 0.760 |      |      |      |      ||      |      |   vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
 | 
			
		||||
2364 | 0.00        | 1.00        |             |             |      | -0.01 | 0.00 |      |      |      ||      |      |   incq %rbx
 | 
			
		||||
2365 | 0.00        | 1.00        |             |             |      | -0.01 | 0.00 |      |      |      ||      |      |   cmpq %rbx, %r9
 | 
			
		||||
2366 |             |             |             |             |      |       |      |      |      |      ||      |      | * jne .LBB5_12
 | 
			
		||||
2367 |             |             |             |             |      |       |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       44.0          15.0          5.50   5.50   5.50   5.50          43.99   15.0                           71    6.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
2280 |  6.0 | shlb	$4, %bpl                       | [2280, 2281, 2282, 2305, 2306, 2307]
 | 
			
		||||
2363 |  4.0 | vfmadd231pd	%zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
 | 
			
		||||
2362 |  4.0 | vfmadd231pd	%zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
 | 
			
		||||
2361 |  4.0 | vfmadd231pd	%zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
 | 
			
		||||
2346 |  4.0 | vfmadd231pd	%zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
 | 
			
		||||
2344 |  4.0 | vfmadd231pd	%zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
 | 
			
		||||
2340 |  4.0 | vfmadd231pd	%zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
 | 
			
		||||
2321 |  4.0 | vfmadd231pd	%zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
 | 
			
		||||
2319 |  4.0 | vfmadd231pd	%zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
 | 
			
		||||
2315 |  4.0 | vfmadd231pd	%zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
 | 
			
		||||
2296 |  4.0 | vfmadd231pd	%zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
 | 
			
		||||
2294 |  4.0 | vfmadd231pd	%zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
 | 
			
		||||
2290 |  4.0 | vfmadd231pd	%zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
 | 
			
		||||
2330 |  3.0 | shlb	$6, %dil                       | [2330, 2331, 2332]
 | 
			
		||||
2364 |  1.0 | incq	%rbx                           | [2364]
 | 
			
		||||
 | 
			
		||||
@@ -1,167 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      gromacs-icx-avx512-dp.s
 | 
			
		||||
Architecture:       CSX
 | 
			
		||||
Timestamp:          2023-02-10 16:30:53
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                      Port pressure in cycles                                       
 | 
			
		||||
     |  0   - 0DV  |  1   |  2   -  2D  |  3   -  3D  |  4   |  5   |  6   |  7   ||  CP  | LCD  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
2241 |             |      |             |             |      |      |      |      ||      |      |   # pointer_increment=64 da67166e5736661e6b03ea29ee7bfd67
 | 
			
		||||
2242 |             |      |             |             |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
2243 |             |      |             |             |      |      |      |      ||      |      |   .LBB5_12:                               #   Parent Loop BB5_7 Depth=1
 | 
			
		||||
2244 |             |      |             |             |      |      |      |      ||      |      |   # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
2245 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||  4.0 |      |   movslq (%r10,%rbx,4), %rcx
 | 
			
		||||
2246 |             | 1.00 |             |             |      | 0.00 |      |      ||  1.0 |      |   leaq (%rcx,%rcx,2), %rdx
 | 
			
		||||
2247 | 0.00        |      |             |             |      |      | 1.00 |      ||  1.0 |      |   shlq $6, %rdx
 | 
			
		||||
2248 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd (%rsi,%rdx), %zmm28             # AlignMOV convert to UnAlignMOV
 | 
			
		||||
2249 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 64(%rsi,%rdx), %zmm29           # AlignMOV convert to UnAlignMOV
 | 
			
		||||
2250 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||  4.0 |      |   vmovupd 128(%rsi,%rdx), %zmm30          # AlignMOV convert to UnAlignMOV
 | 
			
		||||
2251 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 16(%rsp), %zmm3                 # 64-byte Reload
 | 
			
		||||
2252 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm28, %zmm3, %zmm3
 | 
			
		||||
2253 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vsubpd %zmm30, %zmm24, %zmm31
 | 
			
		||||
2254 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 336(%rsp), %zmm16               # 64-byte Reload
 | 
			
		||||
2255 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm29, %zmm16, %zmm16
 | 
			
		||||
2256 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm31, %zmm31, %zmm17
 | 
			
		||||
2257 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm16, %zmm16, %zmm17  # zmm17 = (zmm16 * zmm16) + zmm17
 | 
			
		||||
2258 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm3, %zmm3, %zmm17    # zmm17 = (zmm3 * zmm3) + zmm17
 | 
			
		||||
2259 | 2.50        |      |             |             |      | 0.50 |      |      ||  8.0 |      |   vrcp14pd %zmm17, %zmm18
 | 
			
		||||
2260 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm18, %zmm21, %zmm19
 | 
			
		||||
2261 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm19, %zmm18, %zmm19
 | 
			
		||||
2262 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm19, %zmm18, %zmm19
 | 
			
		||||
2263 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vaddpd %zmm1, %zmm19, %zmm20
 | 
			
		||||
2264 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm18, %zmm22, %zmm18
 | 
			
		||||
2265 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm20, %zmm18, %zmm18
 | 
			
		||||
2266 | 1.00        |      |             |             |      | 0.00 |      |      ||      |      |   vsubpd %zmm30, %zmm25, %zmm20
 | 
			
		||||
2267 |             | 1.00 |             |             |      | 0.00 |      |      ||      |      |   leal (%rcx,%rcx), %edx
 | 
			
		||||
2268 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   cmpq %rdx, %r11
 | 
			
		||||
2269 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   setne %dl
 | 
			
		||||
2270 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   sete %al
 | 
			
		||||
2271 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   addl %ecx, %ecx
 | 
			
		||||
2272 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   incl %ecx
 | 
			
		||||
2273 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   cmpq %rcx, %r11
 | 
			
		||||
2274 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   sete %cl
 | 
			
		||||
2275 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm18, %zmm19, %zmm18
 | 
			
		||||
2276 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 528(%rsp), %zmm19               # 64-byte Reload
 | 
			
		||||
2277 | 1.00        |      |             |             |      | 0.00 |      |      ||      |      |   vsubpd %zmm28, %zmm19, %zmm19
 | 
			
		||||
2278 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   setne %dil
 | 
			
		||||
2279 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   movl %edi, %ebp
 | 
			
		||||
2280 | 0.00        |      |             |             |      |      | 1.00 |      ||      |  1.0 |   shlb $4, %bpl
 | 
			
		||||
2281 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |  1.0 |   subb %al, %bpl
 | 
			
		||||
2282 | 0.00        | 0.25 |             |             |      | 0.00 | 0.75 |      ||      |  1.0 |   addb $-17, %bpl
 | 
			
		||||
2283 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %ebp, %k1
 | 
			
		||||
2284 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltpd %zmm0, %zmm17, %k1 {%k1}
 | 
			
		||||
2285 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 272(%rsp), %zmm17               # 64-byte Reload
 | 
			
		||||
2286 | 0.25        |      |             |             |      | 0.75 |      |      ||      |      |   vsubpd %zmm29, %zmm17, %zmm17
 | 
			
		||||
2287 |             | 1.00 |             |             |      | 0.00 |      |      ||      |      |   leal (%rdx,%rdx), %eax
 | 
			
		||||
2288 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   movl %edi, %ebp
 | 
			
		||||
2289 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd %zmm2, %zmm18, %zmm18
 | 
			
		||||
2290 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14
 | 
			
		||||
2291 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm20, %zmm20, %zmm3
 | 
			
		||||
2292 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm17, %zmm17, %zmm3   # zmm3 = (zmm17 * zmm17) + zmm3
 | 
			
		||||
2293 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm19, %zmm19, %zmm3   # zmm3 = (zmm19 * zmm19) + zmm3
 | 
			
		||||
2294 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11
 | 
			
		||||
2295 | 2.50        |      |             |             |      | 0.50 |      |      ||      |      |   vrcp14pd %zmm3, %zmm16
 | 
			
		||||
2296 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7
 | 
			
		||||
2297 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm16, %zmm21, %zmm18
 | 
			
		||||
2298 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm18, %zmm16, %zmm18
 | 
			
		||||
2299 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm18, %zmm16, %zmm18
 | 
			
		||||
2300 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddpd %zmm1, %zmm18, %zmm31
 | 
			
		||||
2301 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm16, %zmm22, %zmm16
 | 
			
		||||
2302 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm31, %zmm16, %zmm16
 | 
			
		||||
2303 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 464(%rsp), %zmm31               # 64-byte Reload
 | 
			
		||||
2304 | 0.75        |      |             |             |      | 0.25 |      |      ||      |      |   vsubpd %zmm28, %zmm31, %zmm31
 | 
			
		||||
2305 | 0.00        |      |             |             |      |      | 1.00 |      ||      |  1.0 |   shlb $5, %bpl
 | 
			
		||||
2306 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |  1.0 |   orb %al, %bpl
 | 
			
		||||
2307 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |  1.0 |   orb $-35, %bpl
 | 
			
		||||
2308 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %ebp, %k1
 | 
			
		||||
2309 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltpd %zmm0, %zmm3, %k1 {%k1}
 | 
			
		||||
2310 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 208(%rsp), %zmm3                # 64-byte Reload
 | 
			
		||||
2311 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm29, %zmm3, %zmm3
 | 
			
		||||
2312 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm16, %zmm18, %zmm16
 | 
			
		||||
2313 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm30, %zmm26, %zmm18
 | 
			
		||||
2314 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm2, %zmm16, %zmm16
 | 
			
		||||
2315 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15
 | 
			
		||||
2316 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm18, %zmm18, %zmm19
 | 
			
		||||
2317 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm3, %zmm3, %zmm19    # zmm19 = (zmm3 * zmm3) + zmm19
 | 
			
		||||
2318 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm31, %zmm31, %zmm19  # zmm19 = (zmm31 * zmm31) + zmm19
 | 
			
		||||
2319 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10
 | 
			
		||||
2320 | 2.50        |      |             |             |      | 0.50 |      |      ||      |      |   vrcp14pd %zmm19, %zmm17
 | 
			
		||||
2321 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6
 | 
			
		||||
2322 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm17, %zmm21, %zmm16
 | 
			
		||||
2323 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm16, %zmm17, %zmm16
 | 
			
		||||
2324 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm16, %zmm17, %zmm16
 | 
			
		||||
2325 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddpd %zmm1, %zmm16, %zmm20
 | 
			
		||||
2326 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm17, %zmm22, %zmm17
 | 
			
		||||
2327 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm20, %zmm17, %zmm17
 | 
			
		||||
2328 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm17, %zmm16, %zmm16
 | 
			
		||||
2329 |             | 1.00 |             |             |      | 0.00 |      |      ||      |      |   leal (,%rdx,4), %eax
 | 
			
		||||
2330 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $6, %dil
 | 
			
		||||
2331 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   orb %al, %dil
 | 
			
		||||
2332 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   orb $-69, %dil
 | 
			
		||||
2333 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %edi, %k1
 | 
			
		||||
2334 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltpd %zmm0, %zmm19, %k1 {%k1}
 | 
			
		||||
2335 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovupd 400(%rsp), %zmm17               # 64-byte Reload
 | 
			
		||||
2336 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm28, %zmm17, %zmm17
 | 
			
		||||
2337 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm29, %zmm23, %zmm19
 | 
			
		||||
2338 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd %zmm30, %zmm27, %zmm20
 | 
			
		||||
2339 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm2, %zmm16, %zmm16
 | 
			
		||||
2340 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13
 | 
			
		||||
2341 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm20, %zmm20, %zmm28
 | 
			
		||||
2342 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm19, %zmm19, %zmm28  # zmm28 = (zmm19 * zmm19) + zmm28
 | 
			
		||||
2343 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm17, %zmm17, %zmm28  # zmm28 = (zmm17 * zmm17) + zmm28
 | 
			
		||||
2344 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9
 | 
			
		||||
2345 | 2.50        |      |             |             |      | 0.50 |      |      ||      |      |   vrcp14pd %zmm28, %zmm3
 | 
			
		||||
2346 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231pd %zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5
 | 
			
		||||
2347 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd %zmm3, %zmm21, %zmm16
 | 
			
		||||
2348 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm16, %zmm3, %zmm16
 | 
			
		||||
2349 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm16, %zmm3, %zmm16
 | 
			
		||||
2350 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vaddpd %zmm1, %zmm16, %zmm18
 | 
			
		||||
2351 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm3, %zmm22, %zmm3
 | 
			
		||||
2352 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm18, %zmm3, %zmm3
 | 
			
		||||
2353 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm3, %zmm16, %zmm3
 | 
			
		||||
2354 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $3, %dl
 | 
			
		||||
2355 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   shlb $7, %cl
 | 
			
		||||
2356 | 0.00        | 0.25 |             |             |      | 0.00 | 0.75 |      ||      |      |   orb %dl, %cl
 | 
			
		||||
2357 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   addb $-9, %cl
 | 
			
		||||
2358 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %ecx, %k1
 | 
			
		||||
2359 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltpd %zmm0, %zmm28, %k1 {%k1}
 | 
			
		||||
2360 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulpd %zmm2, %zmm3, %zmm3
 | 
			
		||||
2361 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12
 | 
			
		||||
2362 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8
 | 
			
		||||
2363 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4
 | 
			
		||||
2364 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   incq %rbx
 | 
			
		||||
2365 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   cmpq %rbx, %r9
 | 
			
		||||
2366 |             |      |             |             |      |      |      |      ||      |      | * jne .LBB5_12
 | 
			
		||||
2367 |             |      |             |             |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       44.0          15.0   5.50   5.50   5.50   5.50          44.0   15.0           66.0    6.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
2280 |  6.0 | shlb	$4, %bpl                       | [2280, 2281, 2282, 2305, 2306, 2307]
 | 
			
		||||
2363 |  4.0 | vfmadd231pd	%zmm3, %zmm20, %zmm4 {%k1} # zmm4 {%k1} = (zmm20 * zmm3) + zmm4| [2363]
 | 
			
		||||
2362 |  4.0 | vfmadd231pd	%zmm3, %zmm19, %zmm8 {%k1} # zmm8 {%k1} = (zmm19 * zmm3) + zmm8| [2362]
 | 
			
		||||
2361 |  4.0 | vfmadd231pd	%zmm3, %zmm17, %zmm12 {%k1} # zmm12 {%k1} = (zmm17 * zmm3) + zmm12| [2361]
 | 
			
		||||
2346 |  4.0 | vfmadd231pd	%zmm16, %zmm18, %zmm5 {%k1} # zmm5 {%k1} = (zmm18 * zmm16) + zmm5| [2346]
 | 
			
		||||
2344 |  4.0 | vfmadd231pd	%zmm16, %zmm3, %zmm9 {%k1} # zmm9 {%k1} = (zmm3 * zmm16) + zmm9| [2344]
 | 
			
		||||
2340 |  4.0 | vfmadd231pd	%zmm16, %zmm31, %zmm13 {%k1} # zmm13 {%k1} = (zmm31 * zmm16) + zmm13| [2340]
 | 
			
		||||
2321 |  4.0 | vfmadd231pd	%zmm16, %zmm20, %zmm6 {%k1} # zmm6 {%k1} = (zmm20 * zmm16) + zmm6| [2321]
 | 
			
		||||
2319 |  4.0 | vfmadd231pd	%zmm16, %zmm17, %zmm10 {%k1} # zmm10 {%k1} = (zmm17 * zmm16) + zmm10| [2319]
 | 
			
		||||
2315 |  4.0 | vfmadd231pd	%zmm16, %zmm19, %zmm15 {%k1} # zmm15 {%k1} = (zmm19 * zmm16) + zmm15| [2315]
 | 
			
		||||
2296 |  4.0 | vfmadd231pd	%zmm18, %zmm31, %zmm7 {%k1} # zmm7 {%k1} = (zmm31 * zmm18) + zmm7| [2296]
 | 
			
		||||
2294 |  4.0 | vfmadd231pd	%zmm18, %zmm16, %zmm11 {%k1} # zmm11 {%k1} = (zmm16 * zmm18) + zmm11| [2294]
 | 
			
		||||
2290 |  4.0 | vfmadd231pd	%zmm18, %zmm3, %zmm14 {%k1} # zmm14 {%k1} = (zmm3 * zmm18) + zmm14| [2290]
 | 
			
		||||
2330 |  3.0 | shlb	$6, %dil                       | [2330, 2331, 2332]
 | 
			
		||||
2364 |  1.0 | incq	%rbx                           | [2364]
 | 
			
		||||
 | 
			
		||||
@@ -1,162 +0,0 @@
 | 
			
		||||
Intel(R) Architecture Code Analyzer Version -  v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
 | 
			
		||||
Analyzed File -  gromacs-icx-avx512-sp.o
 | 
			
		||||
Binary Format - 64Bit
 | 
			
		||||
Architecture  -  SKX
 | 
			
		||||
Analysis Type - Throughput
 | 
			
		||||
 | 
			
		||||
Throughput Analysis Report
 | 
			
		||||
--------------------------
 | 
			
		||||
Block Throughput: 64.00 Cycles       Throughput Bottleneck: Backend
 | 
			
		||||
Loop Count:  22
 | 
			
		||||
Port Binding In Cycles Per Iteration:
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
|  Port  |   0   -  DV   |   1   |   2   -  D    |   3   -  D    |   4   |   5   |   6   |   7   |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
| Cycles | 50.0     0.0  |  7.0  |  9.5     8.1  |  9.5     7.9  |  3.0  | 50.0  |  7.0  |  0.0  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
DV - Divider pipe (on port 0)
 | 
			
		||||
D - Data fetch pipe (on ports 2 and 3)
 | 
			
		||||
F - Macro Fusion with the previous instruction occurred
 | 
			
		||||
* - instruction micro-ops not bound to a port
 | 
			
		||||
^ - Micro Fusion occurred
 | 
			
		||||
# - ESP Tracking sync uop was issued
 | 
			
		||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
 | 
			
		||||
X - instruction not supported, was not accounted in Analysis
 | 
			
		||||
 | 
			
		||||
| Num Of   |                    Ports pressure in cycles                         |      |
 | 
			
		||||
|  Uops    |  0  - DV    |  1   |  2  -  D    |  3  -  D    |  4   |  5   |  6   |  7   |
 | 
			
		||||
-----------------------------------------------------------------------------------------
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | movsxd rax, dword ptr [r11+rdx*4]
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov rsi, rax
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shl rsi, 0x5
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea rbx, ptr [rsi+rsi*2]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm15, zmmword ptr [rdi+rbx*1]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm16, zmmword ptr [rdi+rbx*1+0x20]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm27, zmmword ptr [rdi+rbx*1+0x40]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm1, zmmword ptr [rsp+0x80]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm24, zmm1, zmm15
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm1, zmmword ptr [rsp+0x140]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubps zmm25, zmm1, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm26, zmm9, zmm27
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm1, zmmword ptr [rsp]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubps zmm21, zmm1, zmm15
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm1, zmmword ptr [rsp+0x100]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm22, zmm1, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubps zmm23, zmm10, zmm27
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm1, zmmword ptr [rsp+0x1c0]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm17, zmm1, zmm15
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm1, zmmword ptr [rsp+0xc0]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubps zmm19, zmm1, zmm16
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm20, zmm11, zmm27
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm1, zmmword ptr [rsp+0x180]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubps zmm18, zmm1, zmm15
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm16, zmm8, zmm16
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubps zmm15, zmm12, zmm27
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm27, zmm26, zmm26
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231ps zmm27, zmm25, zmm25
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231ps zmm27, zmm24, zmm24
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm28, zmm23, zmm23
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231ps zmm28, zmm22, zmm22
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231ps zmm28, zmm21, zmm21
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm29, zmm20, zmm20
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vfmadd231ps zmm29, zmm19, zmm19
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231ps zmm29, zmm17, zmm17
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm30, zmm15, zmm15
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231ps zmm30, zmm16, zmm16
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm31, zmm27
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm1, zmm28
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm2, zmm29
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfmadd231ps zmm30, zmm18, zmm18
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14ps zmm3, zmm30
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm4, zmm6, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm4, zmm31, zmm4
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm4, zmm31, zmm4
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddps zmm5, zmm4, zmm13
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm31, zmm7, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm5, zmm31, zmm5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm31, zmm6, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm31, zmm1, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm31, zmm1, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm4, zmm4, zmm5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddps zmm5, zmm31, zmm13
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm1, zmm7, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm1, zmm1, zmm5
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm5, zmm6, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm5, zmm2, zmm5
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm5, zmm2, zmm5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm1, zmm31, zmm1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddps zmm31, zmm5, zmm13
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm2, zmm7, zmm2
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm2, zmm2, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm31, zmm6, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm31, zmm3, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm31, zmm3, zmm31
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm2, zmm5, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddps zmm5, zmm31, zmm13
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm3, zmm7, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm3, zmm3, zmm5
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm3, zmm31, zmm3
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | xor esi, esi
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | xor edi, edi
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | test eax, 0x7fffffff
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setz sil
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | setnz dil
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | mov eax, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | cmovz eax, r8d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | mov ecx, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | cmovz ecx, r9d
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | xor esi, 0xff
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, esi
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k1{k1}, zmm27, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm4, zmm4, zmm14
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm5{k1}{z}, zmm24, zmm4
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm24{k1}{z}, zmm25, zmm4
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm4{k1}{z}, zmm26, zmm4
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | lea esi, ptr [rdi+rdi*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | or esi, 0xfc
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, esi
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k1{k1}, zmm28, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm1, zmm1, zmm14
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm21{k1}{z}, zmm21, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddps zmm5, zmm5, zmm21
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm21{k1}{z}, zmm22, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddps zmm21, zmm24, zmm21
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm1{k1}{z}, zmm23, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddps zmm1, zmm4, zmm1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, eax
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k1{k1}, zmm29, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm2, zmm2, zmm14
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm4{k1}{z}, zmm17, zmm2
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm17{k1}{z}, zmm19, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm2{k1}{z}, zmm20, zmm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | kmovd k1, ecx
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmpps k1{k1}, zmm30, zmm0, 0x1
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm3, zmm3, zmm14
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm18{k1}{z}, zmm18, zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddps zmm4, zmm4, zmm18
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddps zmm4, zmm5, zmm4
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmulps zmm5{k1}{z}, zmm16, zmm3
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vaddps zmm5, zmm17, zmm5
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddps zmm5, zmm21, zmm5
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulps zmm3{k1}{z}, zmm15, zmm3
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | mov rax, qword ptr [r15+0xb0]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddps zmm2, zmm2, zmm3
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm3, zmmword ptr [rax+rbx*1]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm3, zmm3, zmm4
 | 
			
		||||
|   2      |             |      | 0.5         | 0.5         | 1.0  |      |      |      | vmovups zmmword ptr [rax+rbx*1], zmm3
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vaddps zmm1, zmm1, zmm2
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm2, zmmword ptr [rax+rbx*1+0x20]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vsubps zmm2, zmm2, zmm5
 | 
			
		||||
|   2      |             |      | 0.5         | 0.5         | 1.0  |      |      |      | vmovups zmmword ptr [rax+rbx*1+0x20], zmm2
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups zmm2, zmmword ptr [rax+rbx*1+0x40]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vsubps zmm1, zmm2, zmm1
 | 
			
		||||
|   2      |             |      | 0.5         | 0.5         | 1.0  |      |      |      | vmovups zmmword ptr [rax+rbx*1+0x40], zmm1
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | cmp r10, rdx
 | 
			
		||||
|   0*F    |             |      |             |             |      |      |      |      | jz 0x34
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | mov rdi, qword ptr [r15+0xa0]
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | inc rdx
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | jmp 0xfffffffffffffcfc
 | 
			
		||||
Total Num Of Uops: 140
 | 
			
		||||
Analysis Notes:
 | 
			
		||||
Backend allocation was stalled due to unavailable allocation resources.
 | 
			
		||||
@@ -1,304 +0,0 @@
 | 
			
		||||
 | 
			
		||||
[0] Code Region
 | 
			
		||||
 | 
			
		||||
Iterations:        100
 | 
			
		||||
Instructions:      13000
 | 
			
		||||
Total Cycles:      5640
 | 
			
		||||
Total uOps:        15400
 | 
			
		||||
 | 
			
		||||
Dispatch Width:    6
 | 
			
		||||
uOps Per Cycle:    2.73
 | 
			
		||||
IPC:               2.30
 | 
			
		||||
Block RThroughput: 40.0
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Instruction Info:
 | 
			
		||||
[1]: #uOps
 | 
			
		||||
[2]: Latency
 | 
			
		||||
[3]: RThroughput
 | 
			
		||||
[4]: MayLoad
 | 
			
		||||
[5]: MayStore
 | 
			
		||||
[6]: HasSideEffects (U)
 | 
			
		||||
 | 
			
		||||
[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 | 
			
		||||
 1      5     0.50    *                   movslq	(%r11,%rdx,4), %rax
 | 
			
		||||
 1      1     0.25                        movq	%rax, %rsi
 | 
			
		||||
 1      1     0.50                        shlq	$5, %rsi
 | 
			
		||||
 1      1     0.50                        leaq	(%rsi,%rsi,2), %rbx
 | 
			
		||||
 2      8     0.50    *                   vmovups	(%rdi,%rbx), %zmm15
 | 
			
		||||
 2      8     0.50    *                   vmovups	32(%rdi,%rbx), %zmm16
 | 
			
		||||
 2      8     0.50    *                   vmovups	64(%rdi,%rbx), %zmm27
 | 
			
		||||
 2      8     0.50    *                   vmovups	128(%rsp), %zmm1
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm15, %zmm1, %zmm24
 | 
			
		||||
 2      8     0.50    *                   vmovups	320(%rsp), %zmm1
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm16, %zmm1, %zmm25
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm27, %zmm9, %zmm26
 | 
			
		||||
 2      8     0.50    *                   vmovups	(%rsp), %zmm1
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm15, %zmm1, %zmm21
 | 
			
		||||
 2      8     0.50    *                   vmovups	256(%rsp), %zmm1
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm16, %zmm1, %zmm22
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm27, %zmm10, %zmm23
 | 
			
		||||
 2      8     0.50    *                   vmovups	448(%rsp), %zmm1
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm15, %zmm1, %zmm17
 | 
			
		||||
 2      8     0.50    *                   vmovups	192(%rsp), %zmm1
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm16, %zmm1, %zmm19
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm27, %zmm11, %zmm20
 | 
			
		||||
 2      8     0.50    *                   vmovups	384(%rsp), %zmm1
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm15, %zmm1, %zmm18
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm16, %zmm8, %zmm16
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm27, %zmm12, %zmm15
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm26, %zmm26, %zmm27
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm25, %zmm25, %zmm27
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm24, %zmm24, %zmm27
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm23, %zmm23, %zmm28
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm22, %zmm22, %zmm28
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm21, %zmm21, %zmm28
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm20, %zmm20, %zmm29
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm19, %zmm19, %zmm29
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm17, %zmm17, %zmm29
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm15, %zmm15, %zmm30
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm16, %zmm16, %zmm30
 | 
			
		||||
 3      4     2.00                        vrcp14ps	%zmm27, %zmm31
 | 
			
		||||
 3      4     2.00                        vrcp14ps	%zmm28, %zmm1
 | 
			
		||||
 3      4     2.00                        vrcp14ps	%zmm29, %zmm2
 | 
			
		||||
 1      4     0.50                        vfmadd231ps	%zmm18, %zmm18, %zmm30
 | 
			
		||||
 3      4     2.00                        vrcp14ps	%zmm30, %zmm3
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm31, %zmm6, %zmm4
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm4, %zmm31, %zmm4
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm4, %zmm31, %zmm4
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm13, %zmm4, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm31, %zmm7, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm5, %zmm31, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm1, %zmm6, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm31, %zmm1, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm31, %zmm1, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm5, %zmm4, %zmm4
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm13, %zmm31, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm1, %zmm7, %zmm1
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm5, %zmm1, %zmm1
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm2, %zmm6, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm5, %zmm2, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm5, %zmm2, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm1, %zmm31, %zmm1
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm13, %zmm5, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm2, %zmm7, %zmm2
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm31, %zmm2, %zmm2
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm3, %zmm6, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm31, %zmm3, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm31, %zmm3, %zmm31
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm2, %zmm5, %zmm2
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm13, %zmm31, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm3, %zmm7, %zmm3
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm5, %zmm3, %zmm3
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm3, %zmm31, %zmm3
 | 
			
		||||
 1      0     0.17                        xorl	%esi, %esi
 | 
			
		||||
 1      0     0.17                        xorl	%edi, %edi
 | 
			
		||||
 1      1     0.25                        testl	$2147483647, %eax
 | 
			
		||||
 1      1     0.50                        sete	%sil
 | 
			
		||||
 1      1     0.50                        setne	%dil
 | 
			
		||||
 1      1     0.25                        movl	$255, %eax
 | 
			
		||||
 1      1     0.50                        cmovel	%r8d, %eax
 | 
			
		||||
 1      1     0.25                        movl	$255, %ecx
 | 
			
		||||
 1      1     0.50                        cmovel	%r9d, %ecx
 | 
			
		||||
 1      1     0.25                        xorl	$255, %esi
 | 
			
		||||
 1      1     1.00                        kmovd	%esi, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltps	%zmm0, %zmm27, %k1 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm14, %zmm4, %zmm4
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm4, %zmm24, %zmm5 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm4, %zmm25, %zmm24 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm4, %zmm26, %zmm4 {%k1} {z}
 | 
			
		||||
 1      1     0.50                        leal	(%rdi,%rdi,2), %esi
 | 
			
		||||
 1      1     0.25                        orl	$252, %esi
 | 
			
		||||
 1      1     1.00                        kmovd	%esi, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltps	%zmm0, %zmm28, %k1 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm14, %zmm1, %zmm1
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm1, %zmm21, %zmm21 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm21, %zmm5, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm1, %zmm22, %zmm21 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm21, %zmm24, %zmm21
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm1, %zmm23, %zmm1 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm1, %zmm4, %zmm1
 | 
			
		||||
 1      1     1.00                        kmovd	%eax, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltps	%zmm0, %zmm29, %k1 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm14, %zmm2, %zmm2
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm2, %zmm17, %zmm4 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm2, %zmm19, %zmm17 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm2, %zmm20, %zmm2 {%k1} {z}
 | 
			
		||||
 1      1     1.00                        kmovd	%ecx, %k1
 | 
			
		||||
 1      4     1.00                        vcmpltps	%zmm0, %zmm30, %k1 {%k1}
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm14, %zmm3, %zmm3
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm3, %zmm18, %zmm18 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm18, %zmm4, %zmm4
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm4, %zmm5, %zmm4
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm3, %zmm16, %zmm5 {%k1} {z}
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm5, %zmm17, %zmm5
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm5, %zmm21, %zmm5
 | 
			
		||||
 1      4     0.50                        vmulps	%zmm3, %zmm15, %zmm3 {%k1} {z}
 | 
			
		||||
 1      5     0.50    *                   movq	176(%r15), %rax
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm3, %zmm2, %zmm2
 | 
			
		||||
 2      8     0.50    *                   vmovups	(%rax,%rbx), %zmm3
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm4, %zmm3, %zmm3
 | 
			
		||||
 2      1     1.00           *            vmovups	%zmm3, (%rax,%rbx)
 | 
			
		||||
 1      4     0.50                        vaddps	%zmm2, %zmm1, %zmm1
 | 
			
		||||
 2      8     0.50    *                   vmovups	32(%rax,%rbx), %zmm2
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm5, %zmm2, %zmm2
 | 
			
		||||
 2      1     1.00           *            vmovups	%zmm2, 32(%rax,%rbx)
 | 
			
		||||
 2      8     0.50    *                   vmovups	64(%rax,%rbx), %zmm2
 | 
			
		||||
 1      4     0.50                        vsubps	%zmm1, %zmm2, %zmm1
 | 
			
		||||
 2      1     1.00           *            vmovups	%zmm1, 64(%rax,%rbx)
 | 
			
		||||
 1      1     0.25                        cmpq	%rdx, %r10
 | 
			
		||||
 1      1     0.50                        je	.LBB4_18
 | 
			
		||||
 1      5     0.50    *                   movq	160(%r15), %rdi
 | 
			
		||||
 1      1     0.25                        incq	%rdx
 | 
			
		||||
 1      1     0.50                        jmp	.LBB4_8
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resources:
 | 
			
		||||
[0]   - SKXDivider
 | 
			
		||||
[1]   - SKXFPDivider
 | 
			
		||||
[2]   - SKXPort0
 | 
			
		||||
[3]   - SKXPort1
 | 
			
		||||
[4]   - SKXPort2
 | 
			
		||||
[5]   - SKXPort3
 | 
			
		||||
[6]   - SKXPort4
 | 
			
		||||
[7]   - SKXPort5
 | 
			
		||||
[8]   - SKXPort6
 | 
			
		||||
[9]   - SKXPort7
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resource pressure per iteration:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    
 | 
			
		||||
 -      -     52.01  14.97  8.49   8.51   3.00   52.02  11.00  2.00   
 | 
			
		||||
 | 
			
		||||
Resource pressure by instruction:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
 | 
			
		||||
 -      -      -      -     0.49   0.51    -      -      -      -     movslq	(%r11,%rdx,4), %rax
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     movq	%rax, %rsi
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     shlq	$5, %rsi
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     leaq	(%rsi,%rsi,2), %rbx
 | 
			
		||||
 -      -     0.01   0.99   0.50   0.50    -      -      -      -     vmovups	(%rdi,%rbx), %zmm15
 | 
			
		||||
 -      -      -      -     0.50   0.50    -     1.00    -      -     vmovups	32(%rdi,%rbx), %zmm16
 | 
			
		||||
 -      -      -     1.00   0.50   0.50    -      -      -      -     vmovups	64(%rdi,%rbx), %zmm27
 | 
			
		||||
 -      -      -     0.99   0.51   0.49    -     0.01    -      -     vmovups	128(%rsp), %zmm1
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vsubps	%zmm15, %zmm1, %zmm24
 | 
			
		||||
 -      -      -     1.00   0.49   0.51    -      -      -      -     vmovups	320(%rsp), %zmm1
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vsubps	%zmm16, %zmm1, %zmm25
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vsubps	%zmm27, %zmm9, %zmm26
 | 
			
		||||
 -      -     0.01   0.99   0.51   0.49    -      -      -      -     vmovups	(%rsp), %zmm1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vsubps	%zmm15, %zmm1, %zmm21
 | 
			
		||||
 -      -      -      -     0.49   0.51    -     1.00    -      -     vmovups	256(%rsp), %zmm1
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vsubps	%zmm16, %zmm1, %zmm22
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vsubps	%zmm27, %zmm10, %zmm23
 | 
			
		||||
 -      -      -     1.00   0.51   0.49    -      -      -      -     vmovups	448(%rsp), %zmm1
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vsubps	%zmm15, %zmm1, %zmm17
 | 
			
		||||
 -      -     0.01    -     0.49   0.51    -     0.99    -      -     vmovups	192(%rsp), %zmm1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vsubps	%zmm16, %zmm1, %zmm19
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vsubps	%zmm27, %zmm11, %zmm20
 | 
			
		||||
 -      -     0.99    -     0.50   0.50    -     0.01    -      -     vmovups	384(%rsp), %zmm1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vsubps	%zmm15, %zmm1, %zmm18
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vsubps	%zmm16, %zmm8, %zmm16
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vsubps	%zmm27, %zmm12, %zmm15
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm26, %zmm26, %zmm27
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vfmadd231ps	%zmm25, %zmm25, %zmm27
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vfmadd231ps	%zmm24, %zmm24, %zmm27
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm23, %zmm23, %zmm28
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vfmadd231ps	%zmm22, %zmm22, %zmm28
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vfmadd231ps	%zmm21, %zmm21, %zmm28
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vmulps	%zmm20, %zmm20, %zmm29
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vfmadd231ps	%zmm19, %zmm19, %zmm29
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vfmadd231ps	%zmm17, %zmm17, %zmm29
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vmulps	%zmm15, %zmm15, %zmm30
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vfmadd231ps	%zmm16, %zmm16, %zmm30
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14ps	%zmm27, %zmm31
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14ps	%zmm28, %zmm1
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14ps	%zmm29, %zmm2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vfmadd231ps	%zmm18, %zmm18, %zmm30
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14ps	%zmm30, %zmm3
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm31, %zmm6, %zmm4
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm4, %zmm31, %zmm4
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm4, %zmm31, %zmm4
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vaddps	%zmm13, %zmm4, %zmm5
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm31, %zmm7, %zmm31
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm5, %zmm31, %zmm5
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm1, %zmm6, %zmm31
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm31, %zmm1, %zmm31
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulps	%zmm31, %zmm1, %zmm31
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm5, %zmm4, %zmm4
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vaddps	%zmm13, %zmm31, %zmm5
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm1, %zmm7, %zmm1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm5, %zmm1, %zmm1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm2, %zmm6, %zmm5
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm5, %zmm2, %zmm5
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm5, %zmm2, %zmm5
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulps	%zmm1, %zmm31, %zmm1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vaddps	%zmm13, %zmm5, %zmm31
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm2, %zmm7, %zmm2
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm31, %zmm2, %zmm2
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm3, %zmm6, %zmm31
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm31, %zmm3, %zmm31
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm31, %zmm3, %zmm31
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm2, %zmm5, %zmm2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vaddps	%zmm13, %zmm31, %zmm5
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm3, %zmm7, %zmm3
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm5, %zmm3, %zmm3
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm3, %zmm31, %zmm3
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     xorl	%esi, %esi
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     xorl	%edi, %edi
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     testl	$2147483647, %eax
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     sete	%sil
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     setne	%dil
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     movl	$255, %eax
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     cmovel	%r8d, %eax
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     movl	$255, %ecx
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     cmovel	%r9d, %ecx
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     xorl	$255, %esi
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%esi, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltps	%zmm0, %zmm27, %k1 {%k1}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm14, %zmm4, %zmm4
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm4, %zmm24, %zmm5 {%k1} {z}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm4, %zmm25, %zmm24 {%k1} {z}
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm4, %zmm26, %zmm4 {%k1} {z}
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     leal	(%rdi,%rdi,2), %esi
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     orl	$252, %esi
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%esi, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltps	%zmm0, %zmm28, %k1 {%k1}
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulps	%zmm14, %zmm1, %zmm1
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulps	%zmm1, %zmm21, %zmm21 {%k1} {z}
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vaddps	%zmm21, %zmm5, %zmm5
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vmulps	%zmm1, %zmm22, %zmm21 {%k1} {z}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vaddps	%zmm21, %zmm24, %zmm21
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vmulps	%zmm1, %zmm23, %zmm1 {%k1} {z}
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vaddps	%zmm1, %zmm4, %zmm1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%eax, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltps	%zmm0, %zmm29, %k1 {%k1}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm14, %zmm2, %zmm2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm2, %zmm17, %zmm4 {%k1} {z}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm2, %zmm19, %zmm17 {%k1} {z}
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm2, %zmm20, %zmm2 {%k1} {z}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     kmovd	%ecx, %k1
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltps	%zmm0, %zmm30, %k1 {%k1}
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm14, %zmm3, %zmm3
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm3, %zmm18, %zmm18 {%k1} {z}
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vaddps	%zmm18, %zmm4, %zmm4
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vaddps	%zmm4, %zmm5, %zmm4
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vmulps	%zmm3, %zmm16, %zmm5 {%k1} {z}
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vaddps	%zmm5, %zmm17, %zmm5
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vaddps	%zmm5, %zmm21, %zmm5
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmulps	%zmm3, %zmm15, %zmm3 {%k1} {z}
 | 
			
		||||
 -      -      -      -     1.00    -      -      -      -      -     movq	176(%r15), %rax
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vaddps	%zmm3, %zmm2, %zmm2
 | 
			
		||||
 -      -      -     1.00   0.50   0.50    -      -      -      -     vmovups	(%rax,%rbx), %zmm3
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vsubps	%zmm4, %zmm3, %zmm3
 | 
			
		||||
 -      -      -      -      -      -     1.00    -      -     1.00   vmovups	%zmm3, (%rax,%rbx)
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vaddps	%zmm2, %zmm1, %zmm1
 | 
			
		||||
 -      -      -     1.00   0.50   0.50    -      -      -      -     vmovups	32(%rax,%rbx), %zmm2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vsubps	%zmm5, %zmm2, %zmm2
 | 
			
		||||
 -      -      -      -      -      -     1.00    -      -     1.00   vmovups	%zmm2, 32(%rax,%rbx)
 | 
			
		||||
 -      -      -     1.00   0.50   0.50    -      -      -      -     vmovups	64(%rax,%rbx), %zmm2
 | 
			
		||||
 -      -     0.99    -      -      -      -     0.01    -      -     vsubps	%zmm1, %zmm2, %zmm1
 | 
			
		||||
 -      -      -      -      -     1.00   1.00    -      -      -     vmovups	%zmm1, 64(%rax,%rbx)
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     cmpq	%rdx, %r10
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     je	.LBB4_18
 | 
			
		||||
 -      -      -      -     0.50   0.50    -      -      -      -     movq	160(%r15), %rdi
 | 
			
		||||
 -      -      -     1.00    -      -      -      -      -      -     incq	%rdx
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     jmp	.LBB4_8
 | 
			
		||||
@@ -1,116 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      gromacs-icx-avx512-sp.s
 | 
			
		||||
Architecture:       ICX
 | 
			
		||||
Timestamp:          2023-02-14 12:51:43
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                                 Port pressure in cycles                                                  
 | 
			
		||||
     |  0   - 0DV  |  1   - 1DV  |  2   -  2D  |  3   -  3D  |  4   |   5   |  6   |  7   |  8   |  9   ||  CP  | LCD  |
 | 
			
		||||
------------------------------------------------------------------------------------------------------------------------
 | 
			
		||||
1338 |             |             |             |             |      |       |      |      |      |      ||      |      |   # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
 | 
			
		||||
1339 |             |             |             |             |      |       |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
1340 |             |             |             |             |      |       |      |      |      |      ||      |      |   .LBB2_12:                               #   Parent Loop BB2_7 Depth=1
 | 
			
		||||
1341 |             |             |             |             |      |       |      |      |      |      ||      |      |   # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
1342 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||  5.0 |      |   movslq (%r11,%rax,4), %rcx
 | 
			
		||||
1343 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||  6.0 |      |   leaq (%rcx,%rcx,2), %rdx
 | 
			
		||||
1344 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||  1.0 |      |   shlq $5, %rdx
 | 
			
		||||
1345 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vmovupd (%rsi,%rdx), %zmm16
 | 
			
		||||
1346 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||  5.0 |      |   vbroadcastf64x4 64(%rsi,%rdx), %zmm20   # zmm20 = mem[0,1,2,3,0,1,2,3]
 | 
			
		||||
1347 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||      |      |   vbroadcastf64x4 (%rsi,%rdx), %zmm19     # zmm19 = mem[0,1,2,3,0,1,2,3]
 | 
			
		||||
1348 |             |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vshuff64x2 $238, %zmm16, %zmm16, %zmm21 # zmm21 = zmm16[4,5,6,7,4,5,6,7]
 | 
			
		||||
1349 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubps %zmm19, %zmm6, %zmm18
 | 
			
		||||
1350 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubps %zmm21, %zmm10, %zmm17
 | 
			
		||||
1351 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vsubps %zmm20, %zmm14, %zmm16
 | 
			
		||||
1352 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulps %zmm16, %zmm16, %zmm22
 | 
			
		||||
1353 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231ps %zmm17, %zmm17, %zmm22  # zmm22 = (zmm17 * zmm17) + zmm22
 | 
			
		||||
1354 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231ps %zmm18, %zmm18, %zmm22  # zmm22 = (zmm18 * zmm18) + zmm22
 | 
			
		||||
1355 | 2.50        |             |             |             |      | 0.500 |      |      |      |      ||  6.0 |      |   vrcp14ps %zmm22, %zmm23
 | 
			
		||||
1356 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulps %zmm23, %zmm26, %zmm24
 | 
			
		||||
1357 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulps %zmm24, %zmm23, %zmm24
 | 
			
		||||
1358 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulps %zmm24, %zmm23, %zmm24
 | 
			
		||||
1359 | 0.75        |             |             |             |      | 0.250 |      |      |      |      ||  4.0 |      |   vaddps %zmm1, %zmm24, %zmm25
 | 
			
		||||
1360 | 1.00        |             |             |             |      | 0.000 |      |      |      |      ||      |      |   vmulps %zmm23, %zmm27, %zmm23
 | 
			
		||||
1361 | 1.00        |             |             |             |      | 0.000 |      |      |      |      ||  4.0 |      |   vmulps %zmm25, %zmm23, %zmm23
 | 
			
		||||
1362 | 1.00        |             |             |             |      | 0.000 |      |      |      |      ||  4.0 |      |   vmulps %zmm23, %zmm24, %zmm23
 | 
			
		||||
1363 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   leal (%rcx,%rcx), %edx
 | 
			
		||||
1364 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   xorl %edi, %edi
 | 
			
		||||
1365 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   xorl %ebp, %ebp
 | 
			
		||||
1366 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   cmpq %rdx, %r12
 | 
			
		||||
1367 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   setne %dil
 | 
			
		||||
1368 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   leal 1(%rcx,%rcx), %ecx
 | 
			
		||||
1369 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   sete %bpl
 | 
			
		||||
1370 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   xorl %edx, %edx
 | 
			
		||||
1371 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   xorl %ebx, %ebx
 | 
			
		||||
1372 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   cmpq %rcx, %r12
 | 
			
		||||
1373 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   sete %dl
 | 
			
		||||
1374 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   movl $0, %ecx
 | 
			
		||||
1375 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   setne %bl
 | 
			
		||||
1376 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   cmovel %r8d, %ecx
 | 
			
		||||
1377 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   movl %ebx, %r14d
 | 
			
		||||
1378 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shll $4, %r14d
 | 
			
		||||
1379 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   subl %ebp, %r14d
 | 
			
		||||
1380 | 0.00        | 0.75        |             |             |      | 0.000 | 0.25 |      |      |      ||      |      |   leal (%rcx,%rdi,2), %ecx
 | 
			
		||||
1381 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shll $8, %ecx
 | 
			
		||||
1382 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   addl $239, %r14d
 | 
			
		||||
1383 | 0.00        | 0.50        |             |             |      | 0.000 | 0.50 |      |      |      ||      |      |   addl $-768, %ecx                     # imm = 0xFD00
 | 
			
		||||
1384 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   orl %r14d, %ecx
 | 
			
		||||
1385 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovd %ecx, %k2
 | 
			
		||||
1386 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vcmpltps %zmm0, %zmm22, %k2 {%k2}
 | 
			
		||||
1387 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubps %zmm21, %zmm11, %zmm21
 | 
			
		||||
1388 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubps %zmm20, %zmm15, %zmm20
 | 
			
		||||
1389 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubps %zmm19, %zmm7, %zmm19
 | 
			
		||||
1390 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulps %zmm2, %zmm23, %zmm22
 | 
			
		||||
1391 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231ps %zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12
 | 
			
		||||
1392 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulps %zmm20, %zmm20, %zmm18
 | 
			
		||||
1393 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231ps %zmm21, %zmm21, %zmm18  # zmm18 = (zmm21 * zmm21) + zmm18
 | 
			
		||||
1394 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231ps %zmm19, %zmm19, %zmm18  # zmm18 = (zmm19 * zmm19) + zmm18
 | 
			
		||||
1395 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231ps %zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9
 | 
			
		||||
1396 | 2.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vrcp14ps %zmm18, %zmm17
 | 
			
		||||
1397 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd231ps %zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5
 | 
			
		||||
1398 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulps %zmm17, %zmm26, %zmm16
 | 
			
		||||
1399 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulps %zmm16, %zmm17, %zmm16
 | 
			
		||||
1400 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulps %zmm16, %zmm17, %zmm16
 | 
			
		||||
1401 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vaddps %zmm1, %zmm16, %zmm22
 | 
			
		||||
1402 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulps %zmm17, %zmm27, %zmm17
 | 
			
		||||
1403 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulps %zmm22, %zmm17, %zmm17
 | 
			
		||||
1404 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulps %zmm17, %zmm16, %zmm16
 | 
			
		||||
1405 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shll $6, %ebx
 | 
			
		||||
1406 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   leal (%rbx,%rdi,4), %ecx
 | 
			
		||||
1407 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shll $7, %edx
 | 
			
		||||
1408 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   leal (%rdx,%rdi,8), %edx
 | 
			
		||||
1409 | 0.00        |             |             |             |      |       | 1.00 |      |      |      ||      |      |   shll $8, %edx
 | 
			
		||||
1410 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   addl %edx, %ecx
 | 
			
		||||
1411 | 0.00        | 1.00        |             |             |      | 0.000 | 0.00 |      |      |      ||      |      |   addl $-2117, %ecx                    # imm = 0xF7BB
 | 
			
		||||
1412 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovd %ecx, %k2
 | 
			
		||||
1413 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vcmpltps %zmm0, %zmm18, %k2 {%k2}
 | 
			
		||||
1414 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vmulps %zmm2, %zmm16, %zmm16
 | 
			
		||||
1415 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231ps %zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13
 | 
			
		||||
1416 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231ps %zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8
 | 
			
		||||
1417 | 0.24        |             |             |             |      | 0.760 |      |      |      |      ||      |  4.0 |   vfmadd231ps %zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4
 | 
			
		||||
1418 | 0.00        | 1.00        |             |             |      | -0.01 | 0.00 |      |      |      ||      |      |   incq %rax
 | 
			
		||||
1419 | 0.00        | 1.00        |             |             |      | -0.01 | 0.00 |      |      |      ||      |      |   cmpq %rax, %r10
 | 
			
		||||
1420 |             |             |             |             |      |       |      |      |      |      ||      |      | * jne .LBB2_12
 | 
			
		||||
1421 |             |             |             |             |      |       |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       22.5          16.5          2.00   2.00   2.00   2.00          22.49   16.5                           71    4.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
1417 |  4.0 | vfmadd231ps	%zmm16, %zmm20, %zmm4 {%k2} # zmm4 {%k2} = (zmm20 * zmm16) + zmm4| [1417]
 | 
			
		||||
1416 |  4.0 | vfmadd231ps	%zmm16, %zmm21, %zmm8 {%k2} # zmm8 {%k2} = (zmm21 * zmm16) + zmm8| [1416]
 | 
			
		||||
1415 |  4.0 | vfmadd231ps	%zmm16, %zmm19, %zmm13 {%k2} # zmm13 {%k2} = (zmm19 * zmm16) + zmm13| [1415]
 | 
			
		||||
1397 |  4.0 | vfmadd231ps	%zmm22, %zmm16, %zmm5 {%k2} # zmm5 {%k2} = (zmm16 * zmm22) + zmm5| [1397]
 | 
			
		||||
1395 |  4.0 | vfmadd231ps	%zmm22, %zmm17, %zmm9 {%k2} # zmm9 {%k2} = (zmm17 * zmm22) + zmm9| [1395]
 | 
			
		||||
1391 |  4.0 | vfmadd231ps	%zmm22, %zmm18, %zmm12 {%k2} # zmm12 {%k2} = (zmm18 * zmm22) + zmm12| [1391]
 | 
			
		||||
1418 |  1.0 | incq	%rax                           | [1418]
 | 
			
		||||
 | 
			
		||||
@@ -1,161 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      gromacs-icx-avx512-sp.s
 | 
			
		||||
Architecture:       CSX
 | 
			
		||||
Timestamp:          2023-02-10 16:31:04
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                      Port pressure in cycles                                       
 | 
			
		||||
     |  0   - 0DV  |  1   |  2   -  2D  |  3   -  3D  |  4   |  5   |  6   |  7   ||  CP  | LCD  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
1662 |             |      |             |             |      |      |      |      ||      |      |   # pointer_increment=64 0f91ac4f7fe1a70d0c899f7f3e745649
 | 
			
		||||
1663 |             |      |             |             |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
1664 |             |      |             |             |      |      |      |      ||      |      |   .LBB4_8:                                # =>This Inner Loop Header: Depth=1
 | 
			
		||||
1665 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||  4.0 |      |   movslq (%r11,%rdx,4), %rax
 | 
			
		||||
1666 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||  1.0 |      |   movq %rax, %rsi
 | 
			
		||||
1667 | 0.00        |      |             |             |      |      | 1.00 |      ||  1.0 |      |   shlq $5, %rsi
 | 
			
		||||
1668 |             | 1.00 |             |             |      | 0.00 |      |      ||  1.0 |      |   leaq (%rsi,%rsi,2), %rbx
 | 
			
		||||
1669 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups (%rdi,%rbx), %zmm15             # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1670 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 32(%rdi,%rbx), %zmm16           # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1671 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||  4.0 |      |   vmovups 64(%rdi,%rbx), %zmm27           # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1672 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 128(%rsp), %zmm1                # 64-byte Reload
 | 
			
		||||
1673 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm15, %zmm1, %zmm24
 | 
			
		||||
1674 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 320(%rsp), %zmm1                # 64-byte Reload
 | 
			
		||||
1675 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm16, %zmm1, %zmm25
 | 
			
		||||
1676 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vsubps %zmm27, %zmm9, %zmm26
 | 
			
		||||
1677 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups (%rsp), %zmm1                   # 64-byte Reload
 | 
			
		||||
1678 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm15, %zmm1, %zmm21
 | 
			
		||||
1679 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 256(%rsp), %zmm1                # 64-byte Reload
 | 
			
		||||
1680 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm16, %zmm1, %zmm22
 | 
			
		||||
1681 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm27, %zmm10, %zmm23
 | 
			
		||||
1682 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 448(%rsp), %zmm1                # 64-byte Reload
 | 
			
		||||
1683 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm15, %zmm1, %zmm17
 | 
			
		||||
1684 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 192(%rsp), %zmm1                # 64-byte Reload
 | 
			
		||||
1685 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm16, %zmm1, %zmm19
 | 
			
		||||
1686 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm27, %zmm11, %zmm20
 | 
			
		||||
1687 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 384(%rsp), %zmm1                # 64-byte Reload
 | 
			
		||||
1688 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm15, %zmm1, %zmm18
 | 
			
		||||
1689 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm16, %zmm8, %zmm16
 | 
			
		||||
1690 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubps %zmm27, %zmm12, %zmm15
 | 
			
		||||
1691 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm26, %zmm26, %zmm27
 | 
			
		||||
1692 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231ps %zmm25, %zmm25, %zmm27  # zmm27 = (zmm25 * zmm25) + zmm27
 | 
			
		||||
1693 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231ps %zmm24, %zmm24, %zmm27  # zmm27 = (zmm24 * zmm24) + zmm27
 | 
			
		||||
1694 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm23, %zmm23, %zmm28
 | 
			
		||||
1695 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231ps %zmm22, %zmm22, %zmm28  # zmm28 = (zmm22 * zmm22) + zmm28
 | 
			
		||||
1696 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231ps %zmm21, %zmm21, %zmm28  # zmm28 = (zmm21 * zmm21) + zmm28
 | 
			
		||||
1697 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm20, %zmm20, %zmm29
 | 
			
		||||
1698 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231ps %zmm19, %zmm19, %zmm29  # zmm29 = (zmm19 * zmm19) + zmm29
 | 
			
		||||
1699 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231ps %zmm17, %zmm17, %zmm29  # zmm29 = (zmm17 * zmm17) + zmm29
 | 
			
		||||
1700 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm15, %zmm15, %zmm30
 | 
			
		||||
1701 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231ps %zmm16, %zmm16, %zmm30  # zmm30 = (zmm16 * zmm16) + zmm30
 | 
			
		||||
1702 | 2.50        |      |             |             |      | 0.50 |      |      ||  8.0 |      |   vrcp14ps %zmm27, %zmm31
 | 
			
		||||
1703 | 2.50        |      |             |             |      | 0.50 |      |      ||      |      |   vrcp14ps %zmm28, %zmm1
 | 
			
		||||
1704 | 2.50        |      |             |             |      | 0.50 |      |      ||      |      |   vrcp14ps %zmm29, %zmm2
 | 
			
		||||
1705 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd231ps %zmm18, %zmm18, %zmm30  # zmm30 = (zmm18 * zmm18) + zmm30
 | 
			
		||||
1706 | 2.50        |      |             |             |      | 0.50 |      |      ||      |      |   vrcp14ps %zmm30, %zmm3
 | 
			
		||||
1707 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm31, %zmm6, %zmm4
 | 
			
		||||
1708 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm4, %zmm31, %zmm4
 | 
			
		||||
1709 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm4, %zmm31, %zmm4
 | 
			
		||||
1710 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vaddps %zmm13, %zmm4, %zmm5
 | 
			
		||||
1711 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm31, %zmm7, %zmm31
 | 
			
		||||
1712 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm5, %zmm31, %zmm5
 | 
			
		||||
1713 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm1, %zmm6, %zmm31
 | 
			
		||||
1714 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm31, %zmm1, %zmm31
 | 
			
		||||
1715 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm31, %zmm1, %zmm31
 | 
			
		||||
1716 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm5, %zmm4, %zmm4
 | 
			
		||||
1717 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddps %zmm13, %zmm31, %zmm5
 | 
			
		||||
1718 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm1, %zmm7, %zmm1
 | 
			
		||||
1719 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm5, %zmm1, %zmm1
 | 
			
		||||
1720 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm2, %zmm6, %zmm5
 | 
			
		||||
1721 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm5, %zmm2, %zmm5
 | 
			
		||||
1722 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm5, %zmm2, %zmm5
 | 
			
		||||
1723 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm1, %zmm31, %zmm1
 | 
			
		||||
1724 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddps %zmm13, %zmm5, %zmm31
 | 
			
		||||
1725 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm2, %zmm7, %zmm2
 | 
			
		||||
1726 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm31, %zmm2, %zmm2
 | 
			
		||||
1727 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm3, %zmm6, %zmm31
 | 
			
		||||
1728 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm31, %zmm3, %zmm31
 | 
			
		||||
1729 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm31, %zmm3, %zmm31
 | 
			
		||||
1730 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm2, %zmm5, %zmm2
 | 
			
		||||
1731 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddps %zmm13, %zmm31, %zmm5
 | 
			
		||||
1732 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm3, %zmm7, %zmm3
 | 
			
		||||
1733 | 1.00        |      |             |             |      | 0.00 |      |      ||      |      |   vmulps %zmm5, %zmm3, %zmm3
 | 
			
		||||
1734 | 1.00        |      |             |             |      | 0.00 |      |      ||      |      |   vmulps %zmm3, %zmm31, %zmm3
 | 
			
		||||
1735 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   xorl %esi, %esi
 | 
			
		||||
1736 | 0.00        | 0.50 |             |             |      | 0.00 | 0.50 |      ||      |      |   xorl %edi, %edi
 | 
			
		||||
1737 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   testl $2147483647, %eax               # imm = 0x7FFFFFFF
 | 
			
		||||
1738 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   sete %sil
 | 
			
		||||
1739 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   setne %dil
 | 
			
		||||
1740 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   movl $255, %eax
 | 
			
		||||
1741 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   cmovel %r8d, %eax
 | 
			
		||||
1742 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   movl $255, %ecx
 | 
			
		||||
1743 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   cmovel %r9d, %ecx
 | 
			
		||||
1744 | 0.00        | 0.25 |             |             |      | 0.00 | 0.75 |      ||      |      |   xorl $255, %esi
 | 
			
		||||
1745 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %esi, %k1
 | 
			
		||||
1746 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltps %zmm0, %zmm27, %k1 {%k1}
 | 
			
		||||
1747 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm14, %zmm4, %zmm4
 | 
			
		||||
1748 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulps %zmm4, %zmm24, %zmm5 {%k1} {z}
 | 
			
		||||
1749 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm4, %zmm25, %zmm24 {%k1} {z}
 | 
			
		||||
1750 | 0.25        |      |             |             |      | 0.75 |      |      ||      |      |   vmulps %zmm4, %zmm26, %zmm4 {%k1} {z}
 | 
			
		||||
1751 |             | 1.00 |             |             |      | 0.00 |      |      ||      |      |   leal (%rdi,%rdi,2), %esi
 | 
			
		||||
1752 | 0.00        | 0.75 |             |             |      | 0.00 | 0.25 |      ||      |      |   orl $252, %esi
 | 
			
		||||
1753 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %esi, %k1
 | 
			
		||||
1754 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltps %zmm0, %zmm28, %k1 {%k1}
 | 
			
		||||
1755 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm14, %zmm1, %zmm1
 | 
			
		||||
1756 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm1, %zmm21, %zmm21 {%k1} {z}
 | 
			
		||||
1757 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vaddps %zmm21, %zmm5, %zmm5
 | 
			
		||||
1758 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm1, %zmm22, %zmm21 {%k1} {z}
 | 
			
		||||
1759 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddps %zmm21, %zmm24, %zmm21
 | 
			
		||||
1760 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm1, %zmm23, %zmm1 {%k1} {z}
 | 
			
		||||
1761 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddps %zmm1, %zmm4, %zmm1
 | 
			
		||||
1762 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %eax, %k1
 | 
			
		||||
1763 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltps %zmm0, %zmm29, %k1 {%k1}
 | 
			
		||||
1764 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm14, %zmm2, %zmm2
 | 
			
		||||
1765 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm2, %zmm17, %zmm4 {%k1} {z}
 | 
			
		||||
1766 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm2, %zmm19, %zmm17 {%k1} {z}
 | 
			
		||||
1767 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm2, %zmm20, %zmm2 {%k1} {z}
 | 
			
		||||
1768 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovd %ecx, %k1
 | 
			
		||||
1769 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmpltps %zmm0, %zmm30, %k1 {%k1}
 | 
			
		||||
1770 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm14, %zmm3, %zmm3
 | 
			
		||||
1771 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulps %zmm3, %zmm18, %zmm18 {%k1} {z}
 | 
			
		||||
1772 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vaddps %zmm18, %zmm4, %zmm4
 | 
			
		||||
1773 | 0.25        |      |             |             |      | 0.75 |      |      ||  4.0 |      |   vaddps %zmm4, %zmm5, %zmm4
 | 
			
		||||
1774 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulps %zmm3, %zmm16, %zmm5 {%k1} {z}
 | 
			
		||||
1775 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vaddps %zmm5, %zmm17, %zmm5
 | 
			
		||||
1776 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vaddps %zmm5, %zmm21, %zmm5
 | 
			
		||||
1777 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vmulps %zmm3, %zmm15, %zmm3 {%k1} {z}
 | 
			
		||||
1778 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   movq 176(%r15), %rax
 | 
			
		||||
1779 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vaddps %zmm3, %zmm2, %zmm2
 | 
			
		||||
1780 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups (%rax,%rbx), %zmm3              # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1781 | 0.00        |      |             |             |      | 1.00 |      |      ||  4.0 |      |   vsubps %zmm4, %zmm3, %zmm3
 | 
			
		||||
1782 |             |      | 0.50        | 0.50        | 1.00 |      |      |      ||  0.0 |      |   vmovups %zmm3, (%rax,%rbx)              # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1783 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vaddps %zmm2, %zmm1, %zmm1
 | 
			
		||||
1784 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 32(%rax,%rbx), %zmm2            # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1785 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vsubps %zmm5, %zmm2, %zmm2
 | 
			
		||||
1786 |             |      | 0.50        | 0.50        | 1.00 |      |      |      ||      |      |   vmovups %zmm2, 32(%rax,%rbx)            # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1787 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   vmovups 64(%rax,%rbx), %zmm2            # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1788 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vsubps %zmm1, %zmm2, %zmm1
 | 
			
		||||
1789 |             |      | 0.50        | 0.50        | 1.00 |      |      |      ||      |      |   vmovups %zmm1, 64(%rax,%rbx)            # AlignMOV convert to UnAlignMOV
 | 
			
		||||
1790 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |      |   cmpq %rdx, %r10
 | 
			
		||||
1791 |             |      |             |             |      |      |      |      ||      |      | * je .LBB4_18
 | 
			
		||||
1792 |             |      |             |             |      |      |      |      ||      |      |   # %bb.9:                                #   in Loop: Header=BB4_8 Depth=1
 | 
			
		||||
1793 |             |      | 0.50   0.50 | 0.50   0.50 |      |      |      |      ||      |      |   movq 160(%r15), %rdi
 | 
			
		||||
1794 | 0.00        | 1.00 |             |             |      | 0.00 | 0.00 |      ||      |  1.0 |   incq %rdx
 | 
			
		||||
1795 | 0.00        |      |             |             |      |      | 1.00 |      ||      |      |   jmp .LBB4_8
 | 
			
		||||
1796 |             |      |             |             |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       50.0          9.00   9.50   8.00   9.50   8.00   3.00   50.0   9.00           79.0    1.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
1794 |  1.0 | incq	%rdx                           | [1794]
 | 
			
		||||
 | 
			
		||||
@@ -1,88 +0,0 @@
 | 
			
		||||
Intel(R) Architecture Code Analyzer Version -  v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
 | 
			
		||||
Analyzed File -  lammps-icc-avx2.o
 | 
			
		||||
Binary Format - 64Bit
 | 
			
		||||
Architecture  -  SKX
 | 
			
		||||
Analysis Type - Throughput
 | 
			
		||||
 | 
			
		||||
Throughput Analysis Report
 | 
			
		||||
--------------------------
 | 
			
		||||
Block Throughput: 25.58 Cycles       Throughput Bottleneck: Backend
 | 
			
		||||
Loop Count:  22
 | 
			
		||||
Port Binding In Cycles Per Iteration:
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
|  Port  |   0   -  DV   |   1   |   2   -  D    |   3   -  D    |   4   |   5   |   6   |   7   |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
| Cycles | 13.7     8.0  | 13.6  |  5.5     5.5  |  5.5     5.5  |  0.0  | 13.7  |  7.0  |  0.0  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
DV - Divider pipe (on port 0)
 | 
			
		||||
D - Data fetch pipe (on ports 2 and 3)
 | 
			
		||||
F - Macro Fusion with the previous instruction occurred
 | 
			
		||||
* - instruction micro-ops not bound to a port
 | 
			
		||||
^ - Micro Fusion occurred
 | 
			
		||||
# - ESP Tracking sync uop was issued
 | 
			
		||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
 | 
			
		||||
X - instruction not supported, was not accounted in Analysis
 | 
			
		||||
 | 
			
		||||
| Num Of   |                    Ports pressure in cycles                         |      |
 | 
			
		||||
|  Uops    |  0  - DV    |  1   |  2  -  D    |  3  -  D    |  4   |  5   |  6   |  7   |
 | 
			
		||||
-----------------------------------------------------------------------------------------
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovdqu xmm0, xmmword ptr [rbx+rdx*4]
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmovq rcx, xmm0
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vpunpckhqdq xmm2, xmm0, xmm0
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | vmovq r15, xmm2
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r8d, ecx
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shr rcx, 0x20
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | lea r14d, ptr [rcx+rcx*2]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | lea r8d, ptr [r8+r8*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | movsxd rcx, r8d
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | movsxd r8, r14d
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | mov r14d, r15d
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | shr r15, 0x20
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups xmm7, xmmword ptr [r11+rcx*8]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovups xmm6, xmmword ptr [r11+r8*8]
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovq xmm14, qword ptr [r11+rcx*8+0x10]
 | 
			
		||||
|   1      |             | 0.3  |             |             |      | 0.7  |      |      | lea r14d, ptr [r14+r14*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | movsxd r14, r14d
 | 
			
		||||
|   1      |             | 0.7  |             |             |      | 0.3  |      |      | lea r15d, ptr [r15+r15*2]
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | movsxd r15, r15d
 | 
			
		||||
|   2      |             |      | 0.5     0.5 | 0.5     0.5 |      | 1.0  |      |      | vmovhpd xmm15, xmm14, qword ptr [r11+r8*8+0x10]
 | 
			
		||||
|   2      |             | 1.0  | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vinsertf128 ymm1, ymm7, xmmword ptr [r11+r14*8], 0x1
 | 
			
		||||
|   1      |             |      | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmovq xmm0, qword ptr [r11+r14*8+0x10]
 | 
			
		||||
|   2      |             | 0.3  | 0.5     0.5 | 0.5     0.5 |      | 0.7  |      |      | vinsertf128 ymm6, ymm6, xmmword ptr [r11+r15*8], 0x1
 | 
			
		||||
|   2      |             |      | 0.5     0.5 | 0.5     0.5 |      | 1.0  |      |      | vmovhpd xmm2, xmm0, qword ptr [r11+r15*8+0x10]
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vunpcklpd ymm14, ymm1, ymm6
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vunpckhpd ymm1, ymm1, ymm6
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vsubpd ymm6, ymm10, ymm14
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vinsertf128 ymm7, ymm15, xmm2, 0x1
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vsubpd ymm2, ymm9, ymm1
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vsubpd ymm0, ymm8, ymm7
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vmulpd ymm14, ymm2, ymm2
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vfmadd231pd ymm14, ymm6, ymm6
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vfmadd231pd ymm14, ymm0, ymm0
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vcmppd ymm1, ymm14, ymm5, 0x1
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vpcmpeqd ymm7, ymm7, ymm7
 | 
			
		||||
|   2      | 1.0         |      |             |             |      | 1.0  |      |      | vptest ymm1, ymm7
 | 
			
		||||
|   1      | 1.0     8.0 |      |             |             |      |      |      |      | vdivpd ymm7, ymm4, ymm14
 | 
			
		||||
|   2^     |             | 1.0  | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmulpd ymm14, ymm7, ymmword ptr [rsp+0x60]
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | vmulpd ymm14, ymm7, ymm14
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vmulpd ymm15, ymm7, ymm14
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vfmsub213pd ymm14, ymm7, ymm3
 | 
			
		||||
|   2^     | 0.7         | 0.3  | 0.5     0.5 | 0.5     0.5 |      |      |      |      | vmulpd ymm7, ymm7, ymmword ptr [rsp+0x40]
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vmulpd ymm15, ymm15, ymm7
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vmulpd ymm7, ymm15, ymm14
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vmulpd ymm6, ymm6, ymm7
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vmulpd ymm2, ymm2, ymm7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vandpd ymm6, ymm1, ymm6
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vaddpd ymm13, ymm13, ymm6
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vmulpd ymm6, ymm0, ymm7
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vandpd ymm0, ymm1, ymm2
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vandpd ymm1, ymm1, ymm6
 | 
			
		||||
|   1      | 0.3         | 0.7  |             |             |      |      |      |      | vaddpd ymm12, ymm12, ymm0
 | 
			
		||||
|   1      | 0.7         | 0.3  |             |             |      |      |      |      | vaddpd ymm11, ymm11, ymm1
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | add rdx, 0x4
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | cmp rdx, rsi
 | 
			
		||||
|   0*F    |             |      |             |             |      |      |      |      | jb 0xffffffffffffff02
 | 
			
		||||
Total Num Of Uops: 62
 | 
			
		||||
Analysis Notes:
 | 
			
		||||
Backend allocation was stalled due to unavailable allocation resources.
 | 
			
		||||
@@ -1,156 +0,0 @@
 | 
			
		||||
 | 
			
		||||
[0] Code Region
 | 
			
		||||
 | 
			
		||||
Iterations:        100
 | 
			
		||||
Instructions:      5600
 | 
			
		||||
Total Cycles:      2352
 | 
			
		||||
Total uOps:        6300
 | 
			
		||||
 | 
			
		||||
Dispatch Width:    6
 | 
			
		||||
uOps Per Cycle:    2.68
 | 
			
		||||
IPC:               2.38
 | 
			
		||||
Block RThroughput: 10.5
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Instruction Info:
 | 
			
		||||
[1]: #uOps
 | 
			
		||||
[2]: Latency
 | 
			
		||||
[3]: RThroughput
 | 
			
		||||
[4]: MayLoad
 | 
			
		||||
[5]: MayStore
 | 
			
		||||
[6]: HasSideEffects (U)
 | 
			
		||||
 | 
			
		||||
[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 | 
			
		||||
 1      6     0.50    *                   vmovdqu	(%rbx,%rdx,4), %xmm0
 | 
			
		||||
 1      2     1.00                        vmovq	%xmm0, %rcx
 | 
			
		||||
 1      1     1.00                        vpunpckhqdq	%xmm0, %xmm0, %xmm2
 | 
			
		||||
 1      2     1.00                        vmovq	%xmm2, %r15
 | 
			
		||||
 1      1     0.25                        movl	%ecx, %r8d
 | 
			
		||||
 1      1     0.50                        shrq	$32, %rcx
 | 
			
		||||
 1      1     0.50                        leal	(%rcx,%rcx,2), %r14d
 | 
			
		||||
 1      1     0.50                        leal	(%r8,%r8,2), %r8d
 | 
			
		||||
 1      1     0.25                        movslq	%r8d, %rcx
 | 
			
		||||
 1      1     0.25                        movslq	%r14d, %r8
 | 
			
		||||
 1      1     0.25                        movl	%r15d, %r14d
 | 
			
		||||
 1      1     0.50                        shrq	$32, %r15
 | 
			
		||||
 1      6     0.50    *                   vmovups	(%r11,%rcx,8), %xmm7
 | 
			
		||||
 1      6     0.50    *                   vmovups	(%r11,%r8,8), %xmm6
 | 
			
		||||
 1      5     0.50    *                   vmovq	16(%r11,%rcx,8), %xmm14
 | 
			
		||||
 1      1     0.50                        leal	(%r14,%r14,2), %r14d
 | 
			
		||||
 1      1     0.25                        movslq	%r14d, %r14
 | 
			
		||||
 1      1     0.50                        leal	(%r15,%r15,2), %r15d
 | 
			
		||||
 1      1     0.25                        movslq	%r15d, %r15
 | 
			
		||||
 2      6     1.00    *                   vmovhpd	16(%r11,%r8,8), %xmm14, %xmm15
 | 
			
		||||
 2      7     0.50    *                   vinsertf128	$1, (%r11,%r14,8), %ymm7, %ymm1
 | 
			
		||||
 1      5     0.50    *                   vmovq	16(%r11,%r14,8), %xmm0
 | 
			
		||||
 2      7     0.50    *                   vinsertf128	$1, (%r11,%r15,8), %ymm6, %ymm6
 | 
			
		||||
 2      6     1.00    *                   vmovhpd	16(%r11,%r15,8), %xmm0, %xmm2
 | 
			
		||||
 1      1     1.00                        vunpcklpd	%ymm6, %ymm1, %ymm14
 | 
			
		||||
 1      1     1.00                        vunpckhpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 1      4     0.50                        vsubpd	%ymm14, %ymm10, %ymm6
 | 
			
		||||
 1      3     1.00                        vinsertf128	$1, %xmm2, %ymm15, %ymm7
 | 
			
		||||
 1      4     0.50                        vsubpd	%ymm1, %ymm9, %ymm2
 | 
			
		||||
 1      4     0.50                        vsubpd	%ymm7, %ymm8, %ymm0
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm2, %ymm2, %ymm14
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%ymm6, %ymm6, %ymm14
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%ymm0, %ymm0, %ymm14
 | 
			
		||||
 1      4     0.50                        vcmpltpd	%ymm5, %ymm14, %ymm1
 | 
			
		||||
 1      1     0.50                        vpcmpeqd	%ymm7, %ymm7, %ymm7
 | 
			
		||||
 2      3     1.00                        vptest	%ymm7, %ymm1
 | 
			
		||||
 1      14    5.00                        vdivpd	%ymm14, %ymm4, %ymm7
 | 
			
		||||
 2      11    0.50    *                   vmulpd	96(%rsp), %ymm7, %ymm14
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm14, %ymm7, %ymm14
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm14, %ymm7, %ymm15
 | 
			
		||||
 1      4     0.50                        vfmsub213pd	%ymm3, %ymm7, %ymm14
 | 
			
		||||
 2      11    0.50    *                   vmulpd	64(%rsp), %ymm7, %ymm7
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm15, %ymm15
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm14, %ymm15, %ymm7
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm6, %ymm6
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm2, %ymm2
 | 
			
		||||
 1      1     0.33                        vandpd	%ymm6, %ymm1, %ymm6
 | 
			
		||||
 1      4     0.50                        vaddpd	%ymm6, %ymm13, %ymm13
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm0, %ymm6
 | 
			
		||||
 1      1     0.33                        vandpd	%ymm2, %ymm1, %ymm0
 | 
			
		||||
 1      1     0.33                        vandpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 1      4     0.50                        vaddpd	%ymm0, %ymm12, %ymm12
 | 
			
		||||
 1      4     0.50                        vaddpd	%ymm1, %ymm11, %ymm11
 | 
			
		||||
 1      1     0.25                        addq	$4, %rdx
 | 
			
		||||
 1      1     0.25                        cmpq	%rsi, %rdx
 | 
			
		||||
 1      1     0.50                        jb	..B1.22
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resources:
 | 
			
		||||
[0]   - SKXDivider
 | 
			
		||||
[1]   - SKXFPDivider
 | 
			
		||||
[2]   - SKXPort0
 | 
			
		||||
[3]   - SKXPort1
 | 
			
		||||
[4]   - SKXPort2
 | 
			
		||||
[5]   - SKXPort3
 | 
			
		||||
[6]   - SKXPort4
 | 
			
		||||
[7]   - SKXPort5
 | 
			
		||||
[8]   - SKXPort6
 | 
			
		||||
[9]   - SKXPort7
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resource pressure per iteration:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    
 | 
			
		||||
 -     5.00   16.00  14.12  5.50   5.50    -     13.47  8.41    -     
 | 
			
		||||
 | 
			
		||||
Resource pressure by instruction:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
 | 
			
		||||
 -      -      -      -     0.50   0.50    -      -      -      -     vmovdqu	(%rbx,%rdx,4), %xmm0
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmovq	%xmm0, %rcx
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vpunpckhqdq	%xmm0, %xmm0, %xmm2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     vmovq	%xmm2, %r15
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -     movl	%ecx, %r8d
 | 
			
		||||
 -      -     0.06    -      -      -      -      -     0.94    -     shrq	$32, %rcx
 | 
			
		||||
 -      -      -     0.02    -      -      -     0.98    -      -     leal	(%rcx,%rcx,2), %r14d
 | 
			
		||||
 -      -      -     0.02    -      -      -     0.98    -      -     leal	(%r8,%r8,2), %r8d
 | 
			
		||||
 -      -     0.47   0.02    -      -      -      -     0.51    -     movslq	%r8d, %rcx
 | 
			
		||||
 -      -     0.46   0.02    -      -      -     0.01   0.51    -     movslq	%r14d, %r8
 | 
			
		||||
 -      -     0.03   0.01    -      -      -     0.45   0.51    -     movl	%r15d, %r14d
 | 
			
		||||
 -      -     0.51    -      -      -      -      -     0.49    -     shrq	$32, %r15
 | 
			
		||||
 -      -      -      -     0.49   0.51    -      -      -      -     vmovups	(%r11,%rcx,8), %xmm7
 | 
			
		||||
 -      -      -      -     0.49   0.51    -      -      -      -     vmovups	(%r11,%r8,8), %xmm6
 | 
			
		||||
 -      -      -      -     0.52   0.48    -      -      -      -     vmovq	16(%r11,%rcx,8), %xmm14
 | 
			
		||||
 -      -      -     0.02    -      -      -     0.98    -      -     leal	(%r14,%r14,2), %r14d
 | 
			
		||||
 -      -     0.01   0.01    -      -      -     0.01   0.97    -     movslq	%r14d, %r14
 | 
			
		||||
 -      -      -     0.03    -      -      -     0.97    -      -     leal	(%r15,%r15,2), %r15d
 | 
			
		||||
 -      -     0.04    -      -      -      -      -     0.96    -     movslq	%r15d, %r15
 | 
			
		||||
 -      -      -      -     0.07   0.93    -     1.00    -      -     vmovhpd	16(%r11,%r8,8), %xmm14, %xmm15
 | 
			
		||||
 -      -     0.03   0.46   0.49   0.51    -     0.51    -      -     vinsertf128	$1, (%r11,%r14,8), %ymm7, %ymm1
 | 
			
		||||
 -      -      -      -     0.51   0.49    -      -      -      -     vmovq	16(%r11,%r14,8), %xmm0
 | 
			
		||||
 -      -     0.47   0.02   0.93   0.07    -     0.51    -      -     vinsertf128	$1, (%r11,%r15,8), %ymm6, %ymm6
 | 
			
		||||
 -      -      -      -     0.50   0.50    -     1.00    -      -     vmovhpd	16(%r11,%r15,8), %xmm0, %xmm2
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vunpcklpd	%ymm6, %ymm1, %ymm14
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vunpckhpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 -      -     0.01   0.99    -      -      -      -      -      -     vsubpd	%ymm14, %ymm10, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vinsertf128	$1, %xmm2, %ymm15, %ymm7
 | 
			
		||||
 -      -     0.96   0.04    -      -      -      -      -      -     vsubpd	%ymm1, %ymm9, %ymm2
 | 
			
		||||
 -      -     0.49   0.51    -      -      -      -      -      -     vsubpd	%ymm7, %ymm8, %ymm0
 | 
			
		||||
 -      -     0.48   0.52    -      -      -      -      -      -     vmulpd	%ymm2, %ymm2, %ymm14
 | 
			
		||||
 -      -     0.03   0.97    -      -      -      -      -      -     vfmadd231pd	%ymm6, %ymm6, %ymm14
 | 
			
		||||
 -      -     0.94   0.06    -      -      -      -      -      -     vfmadd231pd	%ymm0, %ymm0, %ymm14
 | 
			
		||||
 -      -     0.47   0.53    -      -      -      -      -      -     vcmpltpd	%ymm5, %ymm14, %ymm1
 | 
			
		||||
 -      -     0.96   0.04    -      -      -      -      -      -     vpcmpeqd	%ymm7, %ymm7, %ymm7
 | 
			
		||||
 -      -     1.00    -      -      -      -     1.00    -      -     vptest	%ymm7, %ymm1
 | 
			
		||||
 -     5.00   1.00    -      -      -      -      -      -      -     vdivpd	%ymm14, %ymm4, %ymm7
 | 
			
		||||
 -      -     0.93   0.07   0.49   0.51    -      -      -      -     vmulpd	96(%rsp), %ymm7, %ymm14
 | 
			
		||||
 -      -     0.05   0.95    -      -      -      -      -      -     vmulpd	%ymm14, %ymm7, %ymm14
 | 
			
		||||
 -      -     0.02   0.98    -      -      -      -      -      -     vmulpd	%ymm14, %ymm7, %ymm15
 | 
			
		||||
 -      -     0.98   0.02    -      -      -      -      -      -     vfmsub213pd	%ymm3, %ymm7, %ymm14
 | 
			
		||||
 -      -     0.07   0.93   0.51   0.49    -      -      -      -     vmulpd	64(%rsp), %ymm7, %ymm7
 | 
			
		||||
 -      -     0.01   0.99    -      -      -      -      -      -     vmulpd	%ymm7, %ymm15, %ymm15
 | 
			
		||||
 -      -     0.01   0.99    -      -      -      -      -      -     vmulpd	%ymm14, %ymm15, %ymm7
 | 
			
		||||
 -      -     0.03   0.97    -      -      -      -      -      -     vmulpd	%ymm7, %ymm6, %ymm6
 | 
			
		||||
 -      -     0.97   0.03    -      -      -      -      -      -     vmulpd	%ymm7, %ymm2, %ymm2
 | 
			
		||||
 -      -     0.03   0.90    -      -      -     0.07    -      -     vandpd	%ymm6, %ymm1, %ymm6
 | 
			
		||||
 -      -     0.06   0.94    -      -      -      -      -      -     vaddpd	%ymm6, %ymm13, %ymm13
 | 
			
		||||
 -      -     0.03   0.97    -      -      -      -      -      -     vmulpd	%ymm7, %ymm0, %ymm6
 | 
			
		||||
 -      -     0.46   0.08    -      -      -     0.46    -      -     vandpd	%ymm2, %ymm1, %ymm0
 | 
			
		||||
 -      -     0.47   0.01    -      -      -     0.52    -      -     vandpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 -      -     0.48   0.52    -      -      -      -      -      -     vaddpd	%ymm0, %ymm12, %ymm12
 | 
			
		||||
 -      -     0.52   0.48    -      -      -      -      -      -     vaddpd	%ymm1, %ymm11, %ymm11
 | 
			
		||||
 -      -     0.01    -      -      -      -      -     0.99    -     addq	$4, %rdx
 | 
			
		||||
 -      -      -      -      -      -      -     0.02   0.98    -     cmpq	%rsi, %rdx
 | 
			
		||||
 -      -     0.45    -      -      -      -      -     0.55    -     jb	..B1.22
 | 
			
		||||
@@ -1,158 +0,0 @@
 | 
			
		||||
 | 
			
		||||
[0] Code Region
 | 
			
		||||
 | 
			
		||||
Iterations:        100
 | 
			
		||||
Instructions:      5600
 | 
			
		||||
Total Cycles:      2306
 | 
			
		||||
Total uOps:        6300
 | 
			
		||||
 | 
			
		||||
Dispatch Width:    6
 | 
			
		||||
uOps Per Cycle:    2.73
 | 
			
		||||
IPC:               2.43
 | 
			
		||||
Block RThroughput: 10.5
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Instruction Info:
 | 
			
		||||
[1]: #uOps
 | 
			
		||||
[2]: Latency
 | 
			
		||||
[3]: RThroughput
 | 
			
		||||
[4]: MayLoad
 | 
			
		||||
[5]: MayStore
 | 
			
		||||
[6]: HasSideEffects (U)
 | 
			
		||||
 | 
			
		||||
[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 | 
			
		||||
 1      6     0.50    *                   vmovdqu	(%rbx,%rdx,4), %xmm0
 | 
			
		||||
 1      2     1.00                        vmovq	%xmm0, %rcx
 | 
			
		||||
 1      1     0.50                        vpunpckhqdq	%xmm0, %xmm0, %xmm2
 | 
			
		||||
 1      2     1.00                        vmovq	%xmm2, %r15
 | 
			
		||||
 1      1     0.25                        movl	%ecx, %r8d
 | 
			
		||||
 1      1     0.50                        shrq	$32, %rcx
 | 
			
		||||
 1      1     0.50                        leal	(%rcx,%rcx,2), %r14d
 | 
			
		||||
 1      1     0.50                        leal	(%r8,%r8,2), %r8d
 | 
			
		||||
 1      1     0.25                        movslq	%r8d, %rcx
 | 
			
		||||
 1      1     0.25                        movslq	%r14d, %r8
 | 
			
		||||
 1      1     0.25                        movl	%r15d, %r14d
 | 
			
		||||
 1      1     0.50                        shrq	$32, %r15
 | 
			
		||||
 1      6     0.50    *                   vmovups	(%r11,%rcx,8), %xmm7
 | 
			
		||||
 1      6     0.50    *                   vmovups	(%r11,%r8,8), %xmm6
 | 
			
		||||
 1      5     0.50    *                   vmovq	16(%r11,%rcx,8), %xmm14
 | 
			
		||||
 1      1     0.50                        leal	(%r14,%r14,2), %r14d
 | 
			
		||||
 1      1     0.25                        movslq	%r14d, %r14
 | 
			
		||||
 1      1     0.50                        leal	(%r15,%r15,2), %r15d
 | 
			
		||||
 1      1     0.25                        movslq	%r15d, %r15
 | 
			
		||||
 2      6     1.00    *                   vmovhpd	16(%r11,%r8,8), %xmm14, %xmm15
 | 
			
		||||
 2      7     0.50    *                   vinsertf128	$1, (%r11,%r14,8), %ymm7, %ymm1
 | 
			
		||||
 1      5     0.50    *                   vmovq	16(%r11,%r14,8), %xmm0
 | 
			
		||||
 2      7     0.50    *                   vinsertf128	$1, (%r11,%r15,8), %ymm6, %ymm6
 | 
			
		||||
 2      6     1.00    *                   vmovhpd	16(%r11,%r15,8), %xmm0, %xmm2
 | 
			
		||||
 1      1     1.00                        vunpcklpd	%ymm6, %ymm1, %ymm14
 | 
			
		||||
 1      1     1.00                        vunpckhpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 1      4     0.50                        vsubpd	%ymm14, %ymm10, %ymm6
 | 
			
		||||
 1      3     1.00                        vinsertf128	$1, %xmm2, %ymm15, %ymm7
 | 
			
		||||
 1      4     0.50                        vsubpd	%ymm1, %ymm9, %ymm2
 | 
			
		||||
 1      4     0.50                        vsubpd	%ymm7, %ymm8, %ymm0
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm2, %ymm2, %ymm14
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%ymm6, %ymm6, %ymm14
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%ymm0, %ymm0, %ymm14
 | 
			
		||||
 1      4     0.50                        vcmpltpd	%ymm5, %ymm14, %ymm1
 | 
			
		||||
 1      1     0.50                        vpcmpeqd	%ymm7, %ymm7, %ymm7
 | 
			
		||||
 2      3     1.00                        vptest	%ymm7, %ymm1
 | 
			
		||||
 1      14    5.00                        vdivpd	%ymm14, %ymm4, %ymm7
 | 
			
		||||
 2      11    0.50    *                   vmulpd	96(%rsp), %ymm7, %ymm14
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm14, %ymm7, %ymm14
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm14, %ymm7, %ymm15
 | 
			
		||||
 1      4     0.50                        vfmsub213pd	%ymm3, %ymm7, %ymm14
 | 
			
		||||
 2      11    0.50    *                   vmulpd	64(%rsp), %ymm7, %ymm7
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm15, %ymm15
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm14, %ymm15, %ymm7
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm6, %ymm6
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm2, %ymm2
 | 
			
		||||
 1      1     0.33                        vandpd	%ymm6, %ymm1, %ymm6
 | 
			
		||||
 1      4     0.50                        vaddpd	%ymm6, %ymm13, %ymm13
 | 
			
		||||
 1      4     0.50                        vmulpd	%ymm7, %ymm0, %ymm6
 | 
			
		||||
 1      1     0.33                        vandpd	%ymm2, %ymm1, %ymm0
 | 
			
		||||
 1      1     0.33                        vandpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 1      4     0.50                        vaddpd	%ymm0, %ymm12, %ymm12
 | 
			
		||||
 1      4     0.50                        vaddpd	%ymm1, %ymm11, %ymm11
 | 
			
		||||
 1      1     0.25                        addq	$4, %rdx
 | 
			
		||||
 1      1     0.25                        cmpq	%rsi, %rdx
 | 
			
		||||
 1      1     0.50                        jb	..B1.22
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resources:
 | 
			
		||||
[0]   - ICXDivider
 | 
			
		||||
[1]   - ICXFPDivider
 | 
			
		||||
[2]   - ICXPort0
 | 
			
		||||
[3]   - ICXPort1
 | 
			
		||||
[4]   - ICXPort2
 | 
			
		||||
[5]   - ICXPort3
 | 
			
		||||
[6]   - ICXPort4
 | 
			
		||||
[7]   - ICXPort5
 | 
			
		||||
[8]   - ICXPort6
 | 
			
		||||
[9]   - ICXPort7
 | 
			
		||||
[10]  - ICXPort8
 | 
			
		||||
[11]  - ICXPort9
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resource pressure per iteration:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   
 | 
			
		||||
 -     5.00   15.12  15.03  5.50   5.50    -     13.45  8.40    -      -      -     
 | 
			
		||||
 | 
			
		||||
Resource pressure by instruction:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
 | 
			
		||||
 -      -      -      -     0.50   0.50    -      -      -      -      -      -     vmovdqu	(%rbx,%rdx,4), %xmm0
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -      -      -     vmovq	%xmm0, %rcx
 | 
			
		||||
 -      -      -     0.46    -      -      -     0.54    -      -      -      -     vpunpckhqdq	%xmm0, %xmm0, %xmm2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -      -      -     vmovq	%xmm2, %r15
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -      -      -     movl	%ecx, %r8d
 | 
			
		||||
 -      -     0.96    -      -      -      -      -     0.04    -      -      -     shrq	$32, %rcx
 | 
			
		||||
 -      -      -     0.01    -      -      -     0.99    -      -      -      -     leal	(%rcx,%rcx,2), %r14d
 | 
			
		||||
 -      -      -     0.03    -      -      -     0.97    -      -      -      -     leal	(%r8,%r8,2), %r8d
 | 
			
		||||
 -      -     0.48   0.01    -      -      -      -     0.51    -      -      -     movslq	%r8d, %rcx
 | 
			
		||||
 -      -     0.02   0.02    -      -      -     0.01   0.95    -      -      -     movslq	%r14d, %r8
 | 
			
		||||
 -      -     0.02    -      -      -      -      -     0.98    -      -      -     movl	%r15d, %r14d
 | 
			
		||||
 -      -     0.52    -      -      -      -      -     0.48    -      -      -     shrq	$32, %r15
 | 
			
		||||
 -      -      -      -     0.49   0.51    -      -      -      -      -      -     vmovups	(%r11,%rcx,8), %xmm7
 | 
			
		||||
 -      -      -      -     0.49   0.51    -      -      -      -      -      -     vmovups	(%r11,%r8,8), %xmm6
 | 
			
		||||
 -      -      -      -     0.52   0.48    -      -      -      -      -      -     vmovq	16(%r11,%rcx,8), %xmm14
 | 
			
		||||
 -      -      -     0.47    -      -      -     0.53    -      -      -      -     leal	(%r14,%r14,2), %r14d
 | 
			
		||||
 -      -     0.01   0.01    -      -      -     0.01   0.97    -      -      -     movslq	%r14d, %r14
 | 
			
		||||
 -      -      -     0.04    -      -      -     0.96    -      -      -      -     leal	(%r15,%r15,2), %r15d
 | 
			
		||||
 -      -     0.48    -      -      -      -     0.01   0.51    -      -      -     movslq	%r15d, %r15
 | 
			
		||||
 -      -      -      -     0.51   0.49    -     1.00    -      -      -      -     vmovhpd	16(%r11,%r8,8), %xmm14, %xmm15
 | 
			
		||||
 -      -     0.02   0.01   0.95   0.05    -     0.97    -      -      -      -     vinsertf128	$1, (%r11,%r14,8), %ymm7, %ymm1
 | 
			
		||||
 -      -      -      -     0.05   0.95    -      -      -      -      -      -     vmovq	16(%r11,%r14,8), %xmm0
 | 
			
		||||
 -      -     0.02   0.49   0.49   0.51    -     0.49    -      -      -      -     vinsertf128	$1, (%r11,%r15,8), %ymm6, %ymm6
 | 
			
		||||
 -      -      -      -     0.50   0.50    -     1.00    -      -      -      -     vmovhpd	16(%r11,%r15,8), %xmm0, %xmm2
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -      -      -     vunpcklpd	%ymm6, %ymm1, %ymm14
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -      -      -     vunpckhpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 -      -     0.47   0.53    -      -      -      -      -      -      -      -     vsubpd	%ymm14, %ymm10, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -      -      -     vinsertf128	$1, %xmm2, %ymm15, %ymm7
 | 
			
		||||
 -      -     0.50   0.50    -      -      -      -      -      -      -      -     vsubpd	%ymm1, %ymm9, %ymm2
 | 
			
		||||
 -      -     0.94   0.06    -      -      -      -      -      -      -      -     vsubpd	%ymm7, %ymm8, %ymm0
 | 
			
		||||
 -      -     0.06   0.94    -      -      -      -      -      -      -      -     vmulpd	%ymm2, %ymm2, %ymm14
 | 
			
		||||
 -      -     0.04   0.96    -      -      -      -      -      -      -      -     vfmadd231pd	%ymm6, %ymm6, %ymm14
 | 
			
		||||
 -      -     0.95   0.05    -      -      -      -      -      -      -      -     vfmadd231pd	%ymm0, %ymm0, %ymm14
 | 
			
		||||
 -      -     0.02   0.98    -      -      -      -      -      -      -      -     vcmpltpd	%ymm5, %ymm14, %ymm1
 | 
			
		||||
 -      -     0.05   0.95    -      -      -      -      -      -      -      -     vpcmpeqd	%ymm7, %ymm7, %ymm7
 | 
			
		||||
 -      -     1.00    -      -      -      -     1.00    -      -      -      -     vptest	%ymm7, %ymm1
 | 
			
		||||
 -     5.00   1.00    -      -      -      -      -      -      -      -      -     vdivpd	%ymm14, %ymm4, %ymm7
 | 
			
		||||
 -      -     0.51   0.49   0.49   0.51    -      -      -      -      -      -     vmulpd	96(%rsp), %ymm7, %ymm14
 | 
			
		||||
 -      -     0.04   0.96    -      -      -      -      -      -      -      -     vmulpd	%ymm14, %ymm7, %ymm14
 | 
			
		||||
 -      -     0.01   0.99    -      -      -      -      -      -      -      -     vmulpd	%ymm14, %ymm7, %ymm15
 | 
			
		||||
 -      -     0.99   0.01    -      -      -      -      -      -      -      -     vfmsub213pd	%ymm3, %ymm7, %ymm14
 | 
			
		||||
 -      -     0.49   0.51   0.51   0.49    -      -      -      -      -      -     vmulpd	64(%rsp), %ymm7, %ymm7
 | 
			
		||||
 -      -     0.01   0.99    -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm15, %ymm15
 | 
			
		||||
 -      -     0.01   0.99    -      -      -      -      -      -      -      -     vmulpd	%ymm14, %ymm15, %ymm7
 | 
			
		||||
 -      -     0.48   0.52    -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm6, %ymm6
 | 
			
		||||
 -      -     0.52   0.48    -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm2, %ymm2
 | 
			
		||||
 -      -     0.46   0.02    -      -      -     0.52    -      -      -      -     vandpd	%ymm6, %ymm1, %ymm6
 | 
			
		||||
 -      -     0.49   0.51    -      -      -      -      -      -      -      -     vaddpd	%ymm6, %ymm13, %ymm13
 | 
			
		||||
 -      -     0.48   0.52    -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm0, %ymm6
 | 
			
		||||
 -      -     0.02   0.52    -      -      -     0.46    -      -      -      -     vandpd	%ymm2, %ymm1, %ymm0
 | 
			
		||||
 -      -     0.02    -      -      -      -     0.98    -      -      -      -     vandpd	%ymm6, %ymm1, %ymm1
 | 
			
		||||
 -      -     0.49   0.51    -      -      -      -      -      -      -      -     vaddpd	%ymm0, %ymm12, %ymm12
 | 
			
		||||
 -      -     0.51   0.49    -      -      -      -      -      -      -      -     vaddpd	%ymm1, %ymm11, %ymm11
 | 
			
		||||
 -      -     0.01    -      -      -      -      -     0.99    -      -      -     addq	$4, %rdx
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.01   0.98    -      -      -     cmpq	%rsi, %rdx
 | 
			
		||||
 -      -     0.01    -      -      -      -      -     0.99    -      -      -     jb	..B1.22
 | 
			
		||||
@@ -1,97 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      lammps-icc-avx2.s
 | 
			
		||||
Architecture:       CSX
 | 
			
		||||
Timestamp:          2023-02-10 16:29:58
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                       Port pressure in cycles                                       
 | 
			
		||||
     |  0   - 0DV  |   1   |  2   -  2D  |  3   -  3D  |  4   |   5   |  6   |  7   ||  CP  | LCD  |
 | 
			
		||||
----------------------------------------------------------------------------------------------------
 | 
			
		||||
 256 |             |       |             |             |      |       |      |      ||      |      |   # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
 | 
			
		||||
 257 |             |       |             |             |      |       |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
 258 |             |       |             |             |      |       |      |      ||      |      |   ..B1.22:                        # Preds ..B1.24 ..B1.21
 | 
			
		||||
 259 |             |       |             |             |      |       |      |      ||      |      |   # Execution count [2.50e+01]
 | 
			
		||||
 260 |             |       | 0.50   0.50 | 0.50   0.50 |      |       |      |      ||  4.0 |      |   vmovdqu   (%rbx,%rdx,4), %xmm0                          #60.21
 | 
			
		||||
 261 | 1.00        |       |             |             |      |       |      |      ||  1.0 |      |   vmovq     %xmm0, %rcx                                   #60.21
 | 
			
		||||
 262 |             |       |             |             |      | 1.000 |      |      ||      |      |   vpunpckhqdq %xmm0, %xmm0, %xmm2                         #60.21
 | 
			
		||||
 263 | 1.00        |       |             |             |      |       |      |      ||      |      |   vmovq     %xmm2, %r15                                   #60.21
 | 
			
		||||
 264 | 0.00        | 0.000 |             |             |      | 0.000 | 1.00 |      ||  1.0 |      |   movl      %ecx, %r8d                                    #60.21
 | 
			
		||||
 265 | 0.00        |       |             |             |      |       | 1.00 |      ||      |      |   shrq      $32, %rcx                                     #60.21
 | 
			
		||||
 266 |             | 0.500 |             |             |      | 0.500 |      |      ||      |      |   lea       (%rcx,%rcx,2), %r14d                          #61.36
 | 
			
		||||
 267 |             | 0.500 |             |             |      | 0.500 |      |      ||  1.0 |      |   lea       (%r8,%r8,2), %r8d                             #61.36
 | 
			
		||||
 268 | 0.00        | 0.000 |             |             |      | 0.000 | 1.00 |      ||  1.0 |      |   movslq    %r8d, %rcx                                    #61.36
 | 
			
		||||
 269 | 0.00        | 0.000 |             |             |      | 0.000 | 1.00 |      ||      |      |   movslq    %r14d, %r8                                    #61.36
 | 
			
		||||
 270 | 0.00        | 0.000 |             |             |      | 0.000 | 1.00 |      ||      |      |   movl      %r15d, %r14d                                  #60.21
 | 
			
		||||
 271 | 0.00        |       |             |             |      |       | 1.00 |      ||      |      |   shrq      $32, %r15                                     #60.21
 | 
			
		||||
 272 |             |       | 0.50   0.50 | 0.50   0.50 |      |       |      |      ||  4.0 |      |   vmovups   (%r11,%rcx,8), %xmm7                          #61.36
 | 
			
		||||
 273 |             |       | 0.50   0.50 | 0.50   0.50 |      |       |      |      ||      |      |   vmovups   (%r11,%r8,8), %xmm6                           #61.36
 | 
			
		||||
 274 |             |       | 0.50   0.50 | 0.50   0.50 |      |       |      |      ||      |      |   vmovq     16(%r11,%rcx,8), %xmm14                       #61.36
 | 
			
		||||
 275 |             | 0.500 |             |             |      | 0.500 |      |      ||      |      |   lea       (%r14,%r14,2), %r14d                          #61.36
 | 
			
		||||
 276 | 0.00        | 0.000 |             |             |      | 0.000 | 1.00 |      ||      |      |   movslq    %r14d, %r14                                   #61.36
 | 
			
		||||
 277 |             | 0.500 |             |             |      | 0.500 |      |      ||      |      |   lea       (%r15,%r15,2), %r15d                          #61.36
 | 
			
		||||
 278 | 0.00        | 0.000 |             |             |      | 0.000 | 1.00 |      ||      |      |   movslq    %r15d, %r15                                   #61.36
 | 
			
		||||
 279 |             |       | 0.50   0.50 | 0.50   0.50 |      | 1.000 |      |      ||      |      |   vmovhpd   16(%r11,%r8,8), %xmm14, %xmm15                #61.36
 | 
			
		||||
 280 |             |       | 0.50   0.50 | 0.50   0.50 |      | 1.000 |      |      ||  3.0 |      |   vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1             #61.36
 | 
			
		||||
 281 |             |       | 0.50   0.50 | 0.50   0.50 |      |       |      |      ||      |      |   vmovq     16(%r11,%r14,8), %xmm0                        #61.36
 | 
			
		||||
 282 |             |       | 0.50   0.50 | 0.50   0.50 |      | 1.000 |      |      ||      |      |   vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6             #61.36
 | 
			
		||||
 283 |             |       | 0.50   0.50 | 0.50   0.50 |      | 1.000 |      |      ||      |      |   vmovhpd   16(%r11,%r15,8), %xmm0, %xmm2                 #61.36
 | 
			
		||||
 284 |             |       |             |             |      | 1.000 |      |      ||      |      |   vunpcklpd %ymm6, %ymm1, %ymm14                          #61.36
 | 
			
		||||
 285 |             |       |             |             |      | 1.000 |      |      ||  1.0 |      |   vunpckhpd %ymm6, %ymm1, %ymm1                           #61.36
 | 
			
		||||
 286 | 0.50        | 0.500 |             |             |      |       |      |      ||      |      |   vsubpd    %ymm14, %ymm10, %ymm6                         #61.36
 | 
			
		||||
 287 |             |       |             |             |      | 1.000 |      |      ||      |      |   vinsertf128 $1, %xmm2, %ymm15, %ymm7                    #61.36
 | 
			
		||||
 288 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vsubpd    %ymm1, %ymm9, %ymm2                           #62.36
 | 
			
		||||
 289 | 0.50        | 0.500 |             |             |      |       |      |      ||      |      |   vsubpd    %ymm7, %ymm8, %ymm0                           #63.36
 | 
			
		||||
 290 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vmulpd    %ymm2, %ymm2, %ymm14                          #64.49
 | 
			
		||||
 291 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vfmadd231pd %ymm6, %ymm6, %ymm14                        #64.49
 | 
			
		||||
 292 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vfmadd231pd %ymm0, %ymm0, %ymm14                        #64.63
 | 
			
		||||
 293 |             |       |             |             |      | 1.000 |      |      ||      |      |   vcmpltpd  %ymm5, %ymm14, %ymm1                          #74.22
 | 
			
		||||
 294 | 0.50        | 0.500 |             |             |      |       |      |      ||      |      |   vpcmpeqd  %ymm7, %ymm7, %ymm7                           #74.22
 | 
			
		||||
 295 | 1.00        |       |             |             |      | 1.000 |      |      ||      |      |   vptest    %ymm7, %ymm1                                  #74.22
 | 
			
		||||
 296 |             |       |             |             |      |       |      |      ||      |      |   #je        ..B1.24       # Prob 50%                      #74.22
 | 
			
		||||
 297 |             |       |             |             |      |       |      |      ||      |      |   # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
 | 
			
		||||
 298 |             |       |             |             |      |       |      |      ||      |      |   ..B1.23:                        # Preds ..B1.22
 | 
			
		||||
 299 |             |       |             |             |      |       |      |      ||      |      |   # Execution count [1.25e+01]
 | 
			
		||||
 300 | 1.00   8.00 |       |             |             |      |       |      |      || 15.0 |      |   vdivpd    %ymm14, %ymm4, %ymm7                          #75.39
 | 
			
		||||
 301 | 0.50        | 0.500 | 0.50   0.50 | 0.50   0.50 |      |       |      |      ||  4.0 |      |   vmulpd    96(%rsp), %ymm7, %ymm14                       #76.38[spill]
 | 
			
		||||
 302 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vmulpd    %ymm14, %ymm7, %ymm14                         #76.44
 | 
			
		||||
 303 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vmulpd    %ymm14, %ymm7, %ymm15                         #76.50
 | 
			
		||||
 304 | 0.50        | 0.500 |             |             |      |       |      |      ||      |      |   vfmsub213pd %ymm3, %ymm7, %ymm14                        #77.55
 | 
			
		||||
 305 | 0.50        | 0.500 | 0.50   0.50 | 0.50   0.50 |      |       |      |      ||      |      |   vmulpd    64(%rsp), %ymm7, %ymm7                        #77.55[spill]
 | 
			
		||||
 306 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vmulpd    %ymm7, %ymm15, %ymm15                         #77.64
 | 
			
		||||
 307 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vmulpd    %ymm14, %ymm15, %ymm7                         #77.70
 | 
			
		||||
 308 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vmulpd    %ymm7, %ymm6, %ymm6                           #78.31
 | 
			
		||||
 309 | 0.50        | 0.500 |             |             |      |       |      |      ||      |      |   vmulpd    %ymm7, %ymm2, %ymm2                           #79.31
 | 
			
		||||
 310 | 0.25        | 0.253 |             |             |      | 0.493 |      |      ||  1.0 |      |   vandpd    %ymm6, %ymm1, %ymm6                           #78.31
 | 
			
		||||
 311 | 0.50        | 0.500 |             |             |      |       |      |      ||  4.0 |      |   vaddpd    %ymm6, %ymm13, %ymm13                         #78.17
 | 
			
		||||
 312 | 0.25        | 0.750 |             |             |      |       |      |      ||      |      |   vmulpd    %ymm7, %ymm0, %ymm6                           #80.31
 | 
			
		||||
 313 | 0.16        | 0.417 |             |             |      | 0.423 |      |      ||      |      |   vandpd    %ymm2, %ymm1, %ymm0                           #79.31
 | 
			
		||||
 314 | 0.00        | 0.250 |             |             |      | 0.750 |      |      ||      |      |   vandpd    %ymm6, %ymm1, %ymm1                           #80.31
 | 
			
		||||
 315 | 0.00        | 1.000 |             |             |      |       |      |      ||      |      |   vaddpd    %ymm0, %ymm12, %ymm12                         #79.17
 | 
			
		||||
 316 | 0.50        | 0.500 |             |             |      |       |      |      ||      |  4.0 |   vaddpd    %ymm1, %ymm11, %ymm11                         #80.17
 | 
			
		||||
 317 |             |       |             |             |      |       |      |      ||      |      |   # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
 | 
			
		||||
 318 |             |       |             |             |      |       |      |      ||      |      |   ..B1.24:                        # Preds ..B1.23 ..B1.22
 | 
			
		||||
 319 |             |       |             |             |      |       |      |      ||      |      |   # Execution count [2.50e+01]
 | 
			
		||||
 320 | 0.00        | 0.000 |             |             |      | -0.01 | 1.00 |      ||      |      |   addq      $4, %rdx                                      #59.9
 | 
			
		||||
 321 | 0.00        | -0.01 |             |             |      | 0.000 | 1.00 |      ||      |      |   cmpq      %rsi, %rdx                                    #59.9
 | 
			
		||||
 322 |             |       |             |             |      |       |      |      ||      |      | * jb        ..B1.22       # Prob 82%                      #59.9
 | 
			
		||||
 323 |             |       |             |             |      |       |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       13.7   8.00   13.66   5.50   5.50   5.50   5.50          13.66   10.0           76.0    4.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
 316 |  4.0 | vaddpd    %ymm1, %ymm11, %ymm11                         #80.17| [316]
 | 
			
		||||
 315 |  4.0 | vaddpd    %ymm0, %ymm12, %ymm12                         #79.17| [315]
 | 
			
		||||
 311 |  4.0 | vaddpd    %ymm6, %ymm13, %ymm13                         #78.17| [311]
 | 
			
		||||
 320 |  1.0 | addq      $4, %rdx                                      #59.9| [320]
 | 
			
		||||
 | 
			
		||||
@@ -1,97 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      lammps-icc-avx2.s
 | 
			
		||||
Architecture:       ICX
 | 
			
		||||
Timestamp:          2023-02-10 16:29:48
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                                Port pressure in cycles                                                 
 | 
			
		||||
     |  0   - 0DV  |  1   - 1DV  |  2   -  2D  |  3   -  3D  |  4   |  5   |  6   |  7   |  8   |  9   ||  CP  | LCD  |
 | 
			
		||||
-----------------------------------------------------------------------------------------------------------------------
 | 
			
		||||
 256 |             |             |             |             |      |      |      |      |      |      ||      |      |   # pointer_increment=32 724d27eafcb27eabca1528ddfdbdba3e
 | 
			
		||||
 257 |             |             |             |             |      |      |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
 258 |             |             |             |             |      |      |      |      |      |      ||      |      |   ..B1.22:                        # Preds ..B1.24 ..B1.21
 | 
			
		||||
 259 |             |             |             |             |      |      |      |      |      |      ||      |      |   # Execution count [2.50e+01]
 | 
			
		||||
 260 |             |             | 0.50   0.50 | 0.50   0.50 |      |      |      |      |      |      ||  5.0 |      |   vmovdqu   (%rbx,%rdx,4), %xmm0                          #60.21
 | 
			
		||||
 261 | 1.00        |             |             |             |      |      |      |      |      |      ||  1.0 |      |   vmovq     %xmm0, %rcx                                   #60.21
 | 
			
		||||
 262 |             | 0.50        |             |             |      | 0.50 |      |      |      |      ||      |      |   vpunpckhqdq %xmm0, %xmm0, %xmm2                         #60.21
 | 
			
		||||
 263 | 1.00        |             |             |             |      |      |      |      |      |      ||      |      |   vmovq     %xmm2, %r15                                   #60.21
 | 
			
		||||
 264 | 0.37        | 0.00        |             |             |      | 0.25 | 0.38 |      |      |      ||  1.0 |      |   movl      %ecx, %r8d                                    #60.21
 | 
			
		||||
 265 | 0.50        |             |             |             |      |      | 0.50 |      |      |      ||      |      |   shrq      $32, %rcx                                     #60.21
 | 
			
		||||
 266 | 0.13        | 0.00        |             |             |      | 0.00 | 0.87 |      |      |      ||      |      |   lea       (%rcx,%rcx,2), %r14d                          #61.36
 | 
			
		||||
 267 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||  6.0 |      |   lea       (%r8,%r8,2), %r8d                             #61.36
 | 
			
		||||
 268 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||  1.0 |      |   movslq    %r8d, %rcx                                    #61.36
 | 
			
		||||
 269 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   movslq    %r14d, %r8                                    #61.36
 | 
			
		||||
 270 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   movl      %r15d, %r14d                                  #60.21
 | 
			
		||||
 271 | 0.00        |             |             |             |      |      | 1.00 |      |      |      ||      |      |   shrq      $32, %r15                                     #60.21
 | 
			
		||||
 272 |             |             | 0.50   0.50 | 0.50   0.50 |      |      |      |      |      |      ||  5.0 |      |   vmovups   (%r11,%rcx,8), %xmm7                          #61.36
 | 
			
		||||
 273 |             |             | 0.50   0.50 | 0.50   0.50 |      |      |      |      |      |      ||      |      |   vmovups   (%r11,%r8,8), %xmm6                           #61.36
 | 
			
		||||
 274 |             |             | 0.50   0.50 | 0.50   0.50 |      |      |      |      |      |      ||      |      |   vmovq     16(%r11,%rcx,8), %xmm14                       #61.36
 | 
			
		||||
 275 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   lea       (%r14,%r14,2), %r14d                          #61.36
 | 
			
		||||
 276 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   movslq    %r14d, %r14                                   #61.36
 | 
			
		||||
 277 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   lea       (%r15,%r15,2), %r15d                          #61.36
 | 
			
		||||
 278 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   movslq    %r15d, %r15                                   #61.36
 | 
			
		||||
 279 |             |             | 0.50   0.50 | 0.50   0.50 |      | 1.00 |      |      |      |      ||      |      |   vmovhpd   16(%r11,%r8,8), %xmm14, %xmm15                #61.36
 | 
			
		||||
 280 |             |             | 0.50   0.50 | 0.50   0.50 |      | 1.00 |      |      |      |      ||  3.0 |      |   vinsertf128 $1, (%r11,%r14,8), %ymm7, %ymm1             #61.36
 | 
			
		||||
 281 |             |             | 0.50   0.50 | 0.50   0.50 |      |      |      |      |      |      ||      |      |   vmovq     16(%r11,%r14,8), %xmm0                        #61.36
 | 
			
		||||
 282 |             |             | 0.50   0.50 | 0.50   0.50 |      | 1.00 |      |      |      |      ||      |      |   vinsertf128 $1, (%r11,%r15,8), %ymm6, %ymm6             #61.36
 | 
			
		||||
 283 |             |             | 0.50   0.50 | 0.50   0.50 |      | 1.00 |      |      |      |      ||      |      |   vmovhpd   16(%r11,%r15,8), %xmm0, %xmm2                 #61.36
 | 
			
		||||
 284 |             |             |             |             |      | 1.00 |      |      |      |      ||      |      |   vunpcklpd %ymm6, %ymm1, %ymm14                          #61.36
 | 
			
		||||
 285 |             |             |             |             |      | 1.00 |      |      |      |      ||  1.0 |      |   vunpckhpd %ymm6, %ymm1, %ymm1                           #61.36
 | 
			
		||||
 286 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||      |      |   vsubpd    %ymm14, %ymm10, %ymm6                         #61.36
 | 
			
		||||
 287 |             |             |             |             |      | 1.00 |      |      |      |      ||      |      |   vinsertf128 $1, %xmm2, %ymm15, %ymm7                    #61.36
 | 
			
		||||
 288 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vsubpd    %ymm1, %ymm9, %ymm2                           #62.36
 | 
			
		||||
 289 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||      |      |   vsubpd    %ymm7, %ymm8, %ymm0                           #63.36
 | 
			
		||||
 290 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vmulpd    %ymm2, %ymm2, %ymm14                          #64.49
 | 
			
		||||
 291 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vfmadd231pd %ymm6, %ymm6, %ymm14                        #64.49
 | 
			
		||||
 292 | 0.75        | 0.25        |             |             |      |      |      |      |      |      ||  4.0 |      |   vfmadd231pd %ymm0, %ymm0, %ymm14                        #64.63
 | 
			
		||||
 293 | 0.00        |             |             |             |      | 1.00 |      |      |      |      ||      |      |   vcmpltpd  %ymm5, %ymm14, %ymm1                          #74.22
 | 
			
		||||
 294 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||      |      |   vpcmpeqd  %ymm7, %ymm7, %ymm7                           #74.22
 | 
			
		||||
 295 | 1.00        |             |             |             |      | 1.00 |      |      |      |      ||      |      |   vptest    %ymm7, %ymm1                                  #74.22
 | 
			
		||||
 296 |             |             |             |             |      |      |      |      |      |      ||      |      |   #je        ..B1.24       # Prob 50%                      #74.22
 | 
			
		||||
 297 |             |             |             |             |      |      |      |      |      |      ||      |      |   # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm6 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13 ymm14
 | 
			
		||||
 298 |             |             |             |             |      |      |      |      |      |      ||      |      |   ..B1.23:                        # Preds ..B1.22
 | 
			
		||||
 299 |             |             |             |             |      |      |      |      |      |      ||      |      |   # Execution count [1.25e+01]
 | 
			
		||||
 300 | 1.00   8.00 |             |             |             |      |      |      |      |      |      || 13.0 |      |   vdivpd    %ymm14, %ymm4, %ymm7                          #75.39
 | 
			
		||||
 301 | 0.50        | 0.50        | 0.50   0.50 | 0.50   0.50 |      |      |      |      |      |      ||  4.0 |      |   vmulpd    96(%rsp), %ymm7, %ymm14                       #76.38[spill]
 | 
			
		||||
 302 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vmulpd    %ymm14, %ymm7, %ymm14                         #76.44
 | 
			
		||||
 303 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vmulpd    %ymm14, %ymm7, %ymm15                         #76.50
 | 
			
		||||
 304 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||      |      |   vfmsub213pd %ymm3, %ymm7, %ymm14                        #77.55
 | 
			
		||||
 305 | 0.50        | 0.50        | 0.50   0.50 | 0.50   0.50 |      |      |      |      |      |      ||      |      |   vmulpd    64(%rsp), %ymm7, %ymm7                        #77.55[spill]
 | 
			
		||||
 306 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vmulpd    %ymm7, %ymm15, %ymm15                         #77.64
 | 
			
		||||
 307 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vmulpd    %ymm14, %ymm15, %ymm7                         #77.70
 | 
			
		||||
 308 | 0.50        | 0.50        |             |             |      |      |      |      |      |      ||  4.0 |      |   vmulpd    %ymm7, %ymm6, %ymm6                           #78.31
 | 
			
		||||
 309 | 0.00        | 1.00        |             |             |      |      |      |      |      |      ||      |      |   vmulpd    %ymm7, %ymm2, %ymm2                           #79.31
 | 
			
		||||
 310 | 0.00        | 0.00        |             |             |      | 1.00 |      |      |      |      ||  1.0 |      |   vandpd    %ymm6, %ymm1, %ymm6                           #78.31
 | 
			
		||||
 311 | 0.00        | 1.00        |             |             |      |      |      |      |      |      ||  4.0 |      |   vaddpd    %ymm6, %ymm13, %ymm13                         #78.17
 | 
			
		||||
 312 | 0.00        | 1.00        |             |             |      |      |      |      |      |      ||      |      |   vmulpd    %ymm7, %ymm0, %ymm6                           #80.31
 | 
			
		||||
 313 | 0.00        | 0.00        |             |             |      | 1.00 |      |      |      |      ||      |      |   vandpd    %ymm2, %ymm1, %ymm0                           #79.31
 | 
			
		||||
 314 | 0.00        | 0.00        |             |             |      | 1.00 |      |      |      |      ||      |      |   vandpd    %ymm6, %ymm1, %ymm1                           #80.31
 | 
			
		||||
 315 | 0.00        | 1.00        |             |             |      |      |      |      |      |      ||      |      |   vaddpd    %ymm0, %ymm12, %ymm12                         #79.17
 | 
			
		||||
 316 | 0.00        | 1.00        |             |             |      |      |      |      |      |      ||      |  4.0 |   vaddpd    %ymm1, %ymm11, %ymm11                         #80.17
 | 
			
		||||
 317 |             |             |             |             |      |      |      |      |      |      ||      |      |   # LOE rax rdx rbx rsi rdi r9 r10 r11 r12 r13d ymm3 ymm4 ymm5 ymm8 ymm9 ymm10 ymm11 ymm12 ymm13
 | 
			
		||||
 318 |             |             |             |             |      |      |      |      |      |      ||      |      |   ..B1.24:                        # Preds ..B1.23 ..B1.22
 | 
			
		||||
 319 |             |             |             |             |      |      |      |      |      |      ||      |      |   # Execution count [2.50e+01]
 | 
			
		||||
 320 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   addq      $4, %rdx                                      #59.9
 | 
			
		||||
 321 | 0.00        | 0.00        |             |             |      | 0.00 | 1.00 |      |      |      ||      |      |   cmpq      %rsi, %rdx                                    #59.9
 | 
			
		||||
 322 |             |             |             |             |      |      |      |      |      |      ||      |      | * jb        ..B1.22       # Prob 82%                      #59.9
 | 
			
		||||
 323 |             |             |             |             |      |      |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       12.8   8.00   12.8          5.50   5.50   5.50   5.50          12.8   12.8                           81    4.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
 316 |  4.0 | vaddpd    %ymm1, %ymm11, %ymm11                         #80.17| [316]
 | 
			
		||||
 315 |  4.0 | vaddpd    %ymm0, %ymm12, %ymm12                         #79.17| [315]
 | 
			
		||||
 311 |  4.0 | vaddpd    %ymm6, %ymm13, %ymm13                         #78.17| [311]
 | 
			
		||||
 320 |  1.0 | addq      $4, %rdx                                      #59.9| [320]
 | 
			
		||||
 | 
			
		||||
@@ -1,75 +0,0 @@
 | 
			
		||||
Intel(R) Architecture Code Analyzer Version -  v3.0-28-g1ba2cbb build date: 2017-10-23;16:42:45
 | 
			
		||||
Analyzed File -  lammps-icc-avx512.o
 | 
			
		||||
Binary Format - 64Bit
 | 
			
		||||
Architecture  -  SKX
 | 
			
		||||
Analysis Type - Throughput
 | 
			
		||||
 | 
			
		||||
Throughput Analysis Report
 | 
			
		||||
--------------------------
 | 
			
		||||
Block Throughput: 30.89 Cycles       Throughput Bottleneck: Backend
 | 
			
		||||
Loop Count:  22
 | 
			
		||||
Port Binding In Cycles Per Iteration:
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
|  Port  |   0   -  DV   |   1   |   2   -  D    |   3   -  D    |   4   |   5   |   6   |   7   |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
| Cycles | 19.0     0.0  |  4.0  | 13.0    13.0  | 13.0    13.0  |  0.0  | 17.0  |  4.0  |  0.0  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 | 
			
		||||
DV - Divider pipe (on port 0)
 | 
			
		||||
D - Data fetch pipe (on ports 2 and 3)
 | 
			
		||||
F - Macro Fusion with the previous instruction occurred
 | 
			
		||||
* - instruction micro-ops not bound to a port
 | 
			
		||||
^ - Micro Fusion occurred
 | 
			
		||||
# - ESP Tracking sync uop was issued
 | 
			
		||||
@ - SSE instruction followed an AVX256/AVX512 instruction, dozens of cycles penalty is expected
 | 
			
		||||
X - instruction not supported, was not accounted in Analysis
 | 
			
		||||
 | 
			
		||||
| Num Of   |                    Ports pressure in cycles                         |      |
 | 
			
		||||
|  Uops    |  0  - DV    |  1   |  2  -  D    |  3  -  D    |  4   |  5   |  6   |  7   |
 | 
			
		||||
-----------------------------------------------------------------------------------------
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vpcmpgtd k5, ymm3, ymm4
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | vpaddd ymm4, ymm4, ymm15
 | 
			
		||||
|   2      |             | 1.0  | 1.0     1.0 |             |      |      |      |      | vmovdqu32 ymm17{k5}{z}, ymmword ptr [r10+r15*4]
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | vpaddd ymm18, ymm17, ymm17
 | 
			
		||||
|   1      |             |      |             |             |      |      | 1.0  |      | add r15, 0x8
 | 
			
		||||
|   1      |             | 1.0  |             |             |      |      |      |      | vpaddd ymm19, ymm17, ymm18
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovw k2, k5
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovw k3, k5
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | kmovw k1, k5
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | vpxord zmm21, zmm21, zmm21
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | vpxord zmm20, zmm20, zmm20
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | vpxord zmm22, zmm22, zmm22
 | 
			
		||||
|   5^     | 1.0         |      | 4.0     4.0 | 4.0     4.0 |      | 1.0  | 1.0  |      | vgatherdpd zmm21, k2, zmmword ptr [rbx+ymm19*8+0x8]
 | 
			
		||||
|   5^     | 1.0         |      | 4.0     4.0 | 4.0     4.0 |      | 1.0  | 1.0  |      | vgatherdpd zmm20, k3, zmmword ptr [rbx+ymm19*8]
 | 
			
		||||
|   5^     | 1.0         |      | 4.0     4.0 | 4.0     4.0 |      | 1.0  | 1.0  |      | vgatherdpd zmm22, k1, zmmword ptr [rbx+ymm19*8+0x10]
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vsubpd zmm18, zmm1, zmm21
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vsubpd zmm17, zmm2, zmm20
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vsubpd zmm19, zmm0, zmm22
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulpd zmm31, zmm18, zmm18
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231pd zmm31, zmm17, zmm17
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231pd zmm31, zmm19, zmm19
 | 
			
		||||
|   3      | 2.0         |      |             |             |      | 1.0  |      |      | vrcp14pd zmm30, zmm31
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vcmppd k6{k5}, zmm31, zmm14, 0x1
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vfpclasspd k0, zmm30, 0x1e
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | vmovaps zmm23, zmm31
 | 
			
		||||
|   2^     | 1.0         |      |             | 1.0     1.0 |      |      |      |      | vfnmadd213pd zmm23, zmm30, qword ptr [rip]{1to8}
 | 
			
		||||
|   1      | 1.0         |      |             |             |      |      |      |      | knotw k4, k0
 | 
			
		||||
|   1      |             |      |             |             |      | 1.0  |      |      | vmulpd zmm24, zmm23, zmm23
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd213pd zmm30{k4}, zmm23, zmm30
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd213pd zmm30{k4}, zmm24, zmm30
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulpd zmm25, zmm30, zmm13
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulpd zmm27, zmm30, zmm12
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulpd zmm28, zmm30, zmm25
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulpd zmm26, zmm30, zmm28
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmsub213pd zmm30, zmm28, zmm5
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulpd zmm29, zmm26, zmm27
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vmulpd zmm23, zmm29, zmm30
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231pd zmm10{k6}, zmm23, zmm17
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231pd zmm9{k6}, zmm23, zmm18
 | 
			
		||||
|   1      | 0.5         |      |             |             |      | 0.5  |      |      | vfmadd231pd zmm8{k6}, zmm23, zmm19
 | 
			
		||||
|   1*     |             |      |             |             |      |      |      |      | cmp r15, r14
 | 
			
		||||
|   0*F    |             |      |             |             |      |      |      |      | jb 0xffffffffffffff0c
 | 
			
		||||
Total Num Of Uops: 57
 | 
			
		||||
Analysis Notes:
 | 
			
		||||
Backend allocation was stalled due to unavailable allocation resources.
 | 
			
		||||
There were bubbles in the frontend.
 | 
			
		||||
@@ -1,128 +0,0 @@
 | 
			
		||||
 | 
			
		||||
[0] Code Region
 | 
			
		||||
 | 
			
		||||
Iterations:        100
 | 
			
		||||
Instructions:      4200
 | 
			
		||||
Total Cycles:      2465
 | 
			
		||||
Total uOps:        5800
 | 
			
		||||
 | 
			
		||||
Dispatch Width:    6
 | 
			
		||||
uOps Per Cycle:    2.35
 | 
			
		||||
IPC:               1.70
 | 
			
		||||
Block RThroughput: 13.0
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Instruction Info:
 | 
			
		||||
[1]: #uOps
 | 
			
		||||
[2]: Latency
 | 
			
		||||
[3]: RThroughput
 | 
			
		||||
[4]: MayLoad
 | 
			
		||||
[5]: MayStore
 | 
			
		||||
[6]: HasSideEffects (U)
 | 
			
		||||
 | 
			
		||||
[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 | 
			
		||||
 1      4     1.00                        vpcmpgtd	%ymm4, %ymm3, %k5
 | 
			
		||||
 1      1     0.33                        vpaddd	%ymm15, %ymm4, %ymm4
 | 
			
		||||
 2      8     0.50    *                   vmovdqu32	(%r10,%r15,4), %ymm17 {%k5} {z}
 | 
			
		||||
 1      1     0.33                        vpaddd	%ymm17, %ymm17, %ymm18
 | 
			
		||||
 1      1     0.25                        addq	$8, %r15
 | 
			
		||||
 1      1     0.33                        vpaddd	%ymm18, %ymm17, %ymm19
 | 
			
		||||
 1      1     1.00                        kmovw	%k5, %k2
 | 
			
		||||
 1      1     1.00                        kmovw	%k5, %k3
 | 
			
		||||
 1      1     1.00                        kmovw	%k5, %k1
 | 
			
		||||
 1      0     0.17                        vpxord	%zmm21, %zmm21, %zmm21
 | 
			
		||||
 1      0     0.17                        vpxord	%zmm20, %zmm20, %zmm20
 | 
			
		||||
 1      0     0.17                        vpxord	%zmm22, %zmm22, %zmm22
 | 
			
		||||
 5      21    4.00    *                   vgatherdpd	8(%rbx,%ymm19,8), %zmm21 {%k2}
 | 
			
		||||
 5      21    4.00    *                   vgatherdpd	(%rbx,%ymm19,8), %zmm20 {%k3}
 | 
			
		||||
 5      21    4.00    *                   vgatherdpd	16(%rbx,%ymm19,8), %zmm22 {%k1}
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm21, %zmm1, %zmm18
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm20, %zmm2, %zmm17
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm22, %zmm0, %zmm19
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm18, %zmm31
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm17, %zmm17, %zmm31
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm19, %zmm19, %zmm31
 | 
			
		||||
 3      4     2.00                        vrcp14pd	%zmm31, %zmm30
 | 
			
		||||
 1      4     1.00                        vcmpltpd	%zmm14, %zmm31, %k6 {%k5}
 | 
			
		||||
 1      4     1.00                        vfpclasspd	$30, %zmm30, %k0
 | 
			
		||||
 1      1     0.50                        vmovaps	%zmm31, %zmm23
 | 
			
		||||
 2      11    0.50    *                   vfnmadd213pd	.L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
 | 
			
		||||
 1      1     1.00                        knotw	%k0, %k4
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm23, %zmm23, %zmm24
 | 
			
		||||
 1      4     0.50                        vfmadd213pd	%zmm30, %zmm23, %zmm30 {%k4}
 | 
			
		||||
 1      4     0.50                        vfmadd213pd	%zmm30, %zmm24, %zmm30 {%k4}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm13, %zmm30, %zmm25
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm12, %zmm30, %zmm27
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm25, %zmm30, %zmm28
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm28, %zmm30, %zmm26
 | 
			
		||||
 1      4     0.50                        vfmsub213pd	%zmm5, %zmm28, %zmm30
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm27, %zmm26, %zmm29
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm30, %zmm29, %zmm23
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm17, %zmm23, %zmm10 {%k6}
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm18, %zmm23, %zmm9 {%k6}
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm19, %zmm23, %zmm8 {%k6}
 | 
			
		||||
 1      1     0.25                        cmpq	%r14, %r15
 | 
			
		||||
 1      1     0.50                        jb	..B1.16
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resources:
 | 
			
		||||
[0]   - SKXDivider
 | 
			
		||||
[1]   - SKXFPDivider
 | 
			
		||||
[2]   - SKXPort0
 | 
			
		||||
[3]   - SKXPort1
 | 
			
		||||
[4]   - SKXPort2
 | 
			
		||||
[5]   - SKXPort3
 | 
			
		||||
[6]   - SKXPort4
 | 
			
		||||
[7]   - SKXPort5
 | 
			
		||||
[8]   - SKXPort6
 | 
			
		||||
[9]   - SKXPort7
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resource pressure per iteration:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    
 | 
			
		||||
 -      -     19.02  6.79   12.64  13.36   -     16.03  5.16    -     
 | 
			
		||||
 | 
			
		||||
Resource pressure by instruction:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vpcmpgtd	%ymm4, %ymm3, %k5
 | 
			
		||||
 -      -     0.28   0.72    -      -      -      -      -      -     vpaddd	%ymm15, %ymm4, %ymm4
 | 
			
		||||
 -      -     0.14   0.71   0.55   0.45    -     0.15    -      -     vmovdqu32	(%r10,%r15,4), %ymm17 {%k5} {z}
 | 
			
		||||
 -      -      -     0.97    -      -      -     0.03    -      -     vpaddd	%ymm17, %ymm17, %ymm18
 | 
			
		||||
 -      -     0.14   0.41    -      -      -     0.13   0.32    -     addq	$8, %r15
 | 
			
		||||
 -      -      -     0.99    -      -      -     0.01    -      -     vpaddd	%ymm18, %ymm17, %ymm19
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     kmovw	%k5, %k2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     kmovw	%k5, %k3
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     kmovw	%k5, %k1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     vpxord	%zmm21, %zmm21, %zmm21
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     vpxord	%zmm20, %zmm20, %zmm20
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     vpxord	%zmm22, %zmm22, %zmm22
 | 
			
		||||
 -      -     1.00   0.99   3.52   4.48    -     0.01   1.00    -     vgatherdpd	8(%rbx,%ymm19,8), %zmm21 {%k2}
 | 
			
		||||
 -      -     1.00   0.99   4.48   3.52    -     0.01   1.00    -     vgatherdpd	(%rbx,%ymm19,8), %zmm20 {%k3}
 | 
			
		||||
 -      -     1.00   1.00   3.52   4.48    -      -     1.00    -     vgatherdpd	16(%rbx,%ymm19,8), %zmm22 {%k1}
 | 
			
		||||
 -      -     0.02    -      -      -      -     0.98    -      -     vsubpd	%zmm21, %zmm1, %zmm18
 | 
			
		||||
 -      -     0.17    -      -      -      -     0.83    -      -     vsubpd	%zmm20, %zmm2, %zmm17
 | 
			
		||||
 -      -     0.18    -      -      -      -     0.82    -      -     vsubpd	%zmm22, %zmm0, %zmm19
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -     vmulpd	%zmm18, %zmm18, %zmm31
 | 
			
		||||
 -      -     0.69    -      -      -      -     0.31    -      -     vfmadd231pd	%zmm17, %zmm17, %zmm31
 | 
			
		||||
 -      -     0.68    -      -      -      -     0.32    -      -     vfmadd231pd	%zmm19, %zmm19, %zmm31
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -     vrcp14pd	%zmm31, %zmm30
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vcmpltpd	%zmm14, %zmm31, %k6 {%k5}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -     vfpclasspd	$30, %zmm30, %k0
 | 
			
		||||
 -      -     0.83    -      -      -      -     0.17    -      -     vmovaps	%zmm31, %zmm23
 | 
			
		||||
 -      -     1.00    -     0.57   0.43    -      -      -      -     vfnmadd213pd	.L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -     knotw	%k0, %k4
 | 
			
		||||
 -      -     0.44    -      -      -      -     0.56    -      -     vmulpd	%zmm23, %zmm23, %zmm24
 | 
			
		||||
 -      -     0.56    -      -      -      -     0.44    -      -     vfmadd213pd	%zmm30, %zmm23, %zmm30 {%k4}
 | 
			
		||||
 -      -     0.55    -      -      -      -     0.45    -      -     vfmadd213pd	%zmm30, %zmm24, %zmm30 {%k4}
 | 
			
		||||
 -      -     0.69    -      -      -      -     0.31    -      -     vmulpd	%zmm13, %zmm30, %zmm25
 | 
			
		||||
 -      -     0.31    -      -      -      -     0.69    -      -     vmulpd	%zmm12, %zmm30, %zmm27
 | 
			
		||||
 -      -     0.56    -      -      -      -     0.44    -      -     vmulpd	%zmm25, %zmm30, %zmm28
 | 
			
		||||
 -      -     0.02    -      -      -      -     0.98    -      -     vmulpd	%zmm28, %zmm30, %zmm26
 | 
			
		||||
 -      -     0.98    -      -      -      -     0.02    -      -     vfmsub213pd	%zmm5, %zmm28, %zmm30
 | 
			
		||||
 -      -     0.30    -      -      -      -     0.70    -      -     vmulpd	%zmm27, %zmm26, %zmm29
 | 
			
		||||
 -      -     0.16    -      -      -      -     0.84    -      -     vmulpd	%zmm30, %zmm29, %zmm23
 | 
			
		||||
 -      -     0.17    -      -      -      -     0.83    -      -     vfmadd231pd	%zmm17, %zmm23, %zmm10 {%k6}
 | 
			
		||||
 -      -     0.83    -      -      -      -     0.17    -      -     vfmadd231pd	%zmm18, %zmm23, %zmm9 {%k6}
 | 
			
		||||
 -      -     0.17    -      -      -      -     0.83    -      -     vfmadd231pd	%zmm19, %zmm23, %zmm8 {%k6}
 | 
			
		||||
 -      -      -     0.01    -      -      -     0.01   0.98    -     cmpq	%r14, %r15
 | 
			
		||||
 -      -     0.14    -      -      -      -      -     0.86    -     jb	..B1.16
 | 
			
		||||
@@ -1,130 +0,0 @@
 | 
			
		||||
 | 
			
		||||
[0] Code Region
 | 
			
		||||
 | 
			
		||||
Iterations:        100
 | 
			
		||||
Instructions:      4200
 | 
			
		||||
Total Cycles:      2465
 | 
			
		||||
Total uOps:        5800
 | 
			
		||||
 | 
			
		||||
Dispatch Width:    6
 | 
			
		||||
uOps Per Cycle:    2.35
 | 
			
		||||
IPC:               1.70
 | 
			
		||||
Block RThroughput: 13.0
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Instruction Info:
 | 
			
		||||
[1]: #uOps
 | 
			
		||||
[2]: Latency
 | 
			
		||||
[3]: RThroughput
 | 
			
		||||
[4]: MayLoad
 | 
			
		||||
[5]: MayStore
 | 
			
		||||
[6]: HasSideEffects (U)
 | 
			
		||||
 | 
			
		||||
[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 | 
			
		||||
 1      4     1.00                        vpcmpgtd	%ymm4, %ymm3, %k5
 | 
			
		||||
 1      1     0.33                        vpaddd	%ymm15, %ymm4, %ymm4
 | 
			
		||||
 2      8     0.50    *                   vmovdqu32	(%r10,%r15,4), %ymm17 {%k5} {z}
 | 
			
		||||
 1      1     0.33                        vpaddd	%ymm17, %ymm17, %ymm18
 | 
			
		||||
 1      1     0.25                        addq	$8, %r15
 | 
			
		||||
 1      1     0.33                        vpaddd	%ymm18, %ymm17, %ymm19
 | 
			
		||||
 1      1     1.00                        kmovw	%k5, %k2
 | 
			
		||||
 1      1     1.00                        kmovw	%k5, %k3
 | 
			
		||||
 1      1     1.00                        kmovw	%k5, %k1
 | 
			
		||||
 1      0     0.17                        vpxord	%zmm21, %zmm21, %zmm21
 | 
			
		||||
 1      0     0.17                        vpxord	%zmm20, %zmm20, %zmm20
 | 
			
		||||
 1      0     0.17                        vpxord	%zmm22, %zmm22, %zmm22
 | 
			
		||||
 5      21    4.00    *                   vgatherdpd	8(%rbx,%ymm19,8), %zmm21 {%k2}
 | 
			
		||||
 5      21    4.00    *                   vgatherdpd	(%rbx,%ymm19,8), %zmm20 {%k3}
 | 
			
		||||
 5      21    4.00    *                   vgatherdpd	16(%rbx,%ymm19,8), %zmm22 {%k1}
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm21, %zmm1, %zmm18
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm20, %zmm2, %zmm17
 | 
			
		||||
 1      4     0.50                        vsubpd	%zmm22, %zmm0, %zmm19
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm18, %zmm18, %zmm31
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm17, %zmm17, %zmm31
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm19, %zmm19, %zmm31
 | 
			
		||||
 3      4     2.00                        vrcp14pd	%zmm31, %zmm30
 | 
			
		||||
 1      4     1.00                        vcmpltpd	%zmm14, %zmm31, %k6 {%k5}
 | 
			
		||||
 1      4     1.00                        vfpclasspd	$30, %zmm30, %k0
 | 
			
		||||
 1      1     0.50                        vmovaps	%zmm31, %zmm23
 | 
			
		||||
 2      11    0.50    *                   vfnmadd213pd	.L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
 | 
			
		||||
 1      1     1.00                        knotw	%k0, %k4
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm23, %zmm23, %zmm24
 | 
			
		||||
 1      4     0.50                        vfmadd213pd	%zmm30, %zmm23, %zmm30 {%k4}
 | 
			
		||||
 1      4     0.50                        vfmadd213pd	%zmm30, %zmm24, %zmm30 {%k4}
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm13, %zmm30, %zmm25
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm12, %zmm30, %zmm27
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm25, %zmm30, %zmm28
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm28, %zmm30, %zmm26
 | 
			
		||||
 1      4     0.50                        vfmsub213pd	%zmm5, %zmm28, %zmm30
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm27, %zmm26, %zmm29
 | 
			
		||||
 1      4     0.50                        vmulpd	%zmm30, %zmm29, %zmm23
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm17, %zmm23, %zmm10 {%k6}
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm18, %zmm23, %zmm9 {%k6}
 | 
			
		||||
 1      4     0.50                        vfmadd231pd	%zmm19, %zmm23, %zmm8 {%k6}
 | 
			
		||||
 1      1     0.25                        cmpq	%r14, %r15
 | 
			
		||||
 1      1     0.50                        jb	..B1.16
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resources:
 | 
			
		||||
[0]   - ICXDivider
 | 
			
		||||
[1]   - ICXFPDivider
 | 
			
		||||
[2]   - ICXPort0
 | 
			
		||||
[3]   - ICXPort1
 | 
			
		||||
[4]   - ICXPort2
 | 
			
		||||
[5]   - ICXPort3
 | 
			
		||||
[6]   - ICXPort4
 | 
			
		||||
[7]   - ICXPort5
 | 
			
		||||
[8]   - ICXPort6
 | 
			
		||||
[9]   - ICXPort7
 | 
			
		||||
[10]  - ICXPort8
 | 
			
		||||
[11]  - ICXPort9
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resource pressure per iteration:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   
 | 
			
		||||
 -      -     19.02  6.79   12.64  13.36   -     16.03  5.16    -      -      -     
 | 
			
		||||
 | 
			
		||||
Resource pressure by instruction:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   Instructions:
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -      -      -     vpcmpgtd	%ymm4, %ymm3, %k5
 | 
			
		||||
 -      -     0.28   0.72    -      -      -      -      -      -      -      -     vpaddd	%ymm15, %ymm4, %ymm4
 | 
			
		||||
 -      -     0.14   0.71   0.55   0.45    -     0.15    -      -      -      -     vmovdqu32	(%r10,%r15,4), %ymm17 {%k5} {z}
 | 
			
		||||
 -      -      -     0.97    -      -      -     0.03    -      -      -      -     vpaddd	%ymm17, %ymm17, %ymm18
 | 
			
		||||
 -      -     0.14   0.41    -      -      -     0.13   0.32    -      -      -     addq	$8, %r15
 | 
			
		||||
 -      -      -     0.99    -      -      -     0.01    -      -      -      -     vpaddd	%ymm18, %ymm17, %ymm19
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -      -      -     kmovw	%k5, %k2
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -      -      -     kmovw	%k5, %k3
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -      -      -     kmovw	%k5, %k1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     vpxord	%zmm21, %zmm21, %zmm21
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     vpxord	%zmm20, %zmm20, %zmm20
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     vpxord	%zmm22, %zmm22, %zmm22
 | 
			
		||||
 -      -     1.00   0.99   3.52   4.48    -     0.01   1.00    -      -      -     vgatherdpd	8(%rbx,%ymm19,8), %zmm21 {%k2}
 | 
			
		||||
 -      -     1.00   0.99   4.48   3.52    -     0.01   1.00    -      -      -     vgatherdpd	(%rbx,%ymm19,8), %zmm20 {%k3}
 | 
			
		||||
 -      -     1.00   1.00   3.52   4.48    -      -     1.00    -      -      -     vgatherdpd	16(%rbx,%ymm19,8), %zmm22 {%k1}
 | 
			
		||||
 -      -     0.02    -      -      -      -     0.98    -      -      -      -     vsubpd	%zmm21, %zmm1, %zmm18
 | 
			
		||||
 -      -     0.17    -      -      -      -     0.83    -      -      -      -     vsubpd	%zmm20, %zmm2, %zmm17
 | 
			
		||||
 -      -     0.18    -      -      -      -     0.82    -      -      -      -     vsubpd	%zmm22, %zmm0, %zmm19
 | 
			
		||||
 -      -     0.01    -      -      -      -     0.99    -      -      -      -     vmulpd	%zmm18, %zmm18, %zmm31
 | 
			
		||||
 -      -     0.69    -      -      -      -     0.31    -      -      -      -     vfmadd231pd	%zmm17, %zmm17, %zmm31
 | 
			
		||||
 -      -     0.68    -      -      -      -     0.32    -      -      -      -     vfmadd231pd	%zmm19, %zmm19, %zmm31
 | 
			
		||||
 -      -     2.00    -      -      -      -     1.00    -      -      -      -     vrcp14pd	%zmm31, %zmm30
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -      -      -     vcmpltpd	%zmm14, %zmm31, %k6 {%k5}
 | 
			
		||||
 -      -      -      -      -      -      -     1.00    -      -      -      -     vfpclasspd	$30, %zmm30, %k0
 | 
			
		||||
 -      -     0.83    -      -      -      -     0.17    -      -      -      -     vmovaps	%zmm31, %zmm23
 | 
			
		||||
 -      -     1.00    -     0.57   0.43    -      -      -      -      -      -     vfnmadd213pd	.L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23
 | 
			
		||||
 -      -     1.00    -      -      -      -      -      -      -      -      -     knotw	%k0, %k4
 | 
			
		||||
 -      -     0.44    -      -      -      -     0.56    -      -      -      -     vmulpd	%zmm23, %zmm23, %zmm24
 | 
			
		||||
 -      -     0.56    -      -      -      -     0.44    -      -      -      -     vfmadd213pd	%zmm30, %zmm23, %zmm30 {%k4}
 | 
			
		||||
 -      -     0.55    -      -      -      -     0.45    -      -      -      -     vfmadd213pd	%zmm30, %zmm24, %zmm30 {%k4}
 | 
			
		||||
 -      -     0.69    -      -      -      -     0.31    -      -      -      -     vmulpd	%zmm13, %zmm30, %zmm25
 | 
			
		||||
 -      -     0.31    -      -      -      -     0.69    -      -      -      -     vmulpd	%zmm12, %zmm30, %zmm27
 | 
			
		||||
 -      -     0.56    -      -      -      -     0.44    -      -      -      -     vmulpd	%zmm25, %zmm30, %zmm28
 | 
			
		||||
 -      -     0.02    -      -      -      -     0.98    -      -      -      -     vmulpd	%zmm28, %zmm30, %zmm26
 | 
			
		||||
 -      -     0.98    -      -      -      -     0.02    -      -      -      -     vfmsub213pd	%zmm5, %zmm28, %zmm30
 | 
			
		||||
 -      -     0.30    -      -      -      -     0.70    -      -      -      -     vmulpd	%zmm27, %zmm26, %zmm29
 | 
			
		||||
 -      -     0.16    -      -      -      -     0.84    -      -      -      -     vmulpd	%zmm30, %zmm29, %zmm23
 | 
			
		||||
 -      -     0.17    -      -      -      -     0.83    -      -      -      -     vfmadd231pd	%zmm17, %zmm23, %zmm10 {%k6}
 | 
			
		||||
 -      -     0.83    -      -      -      -     0.17    -      -      -      -     vfmadd231pd	%zmm18, %zmm23, %zmm9 {%k6}
 | 
			
		||||
 -      -     0.17    -      -      -      -     0.83    -      -      -      -     vfmadd231pd	%zmm19, %zmm23, %zmm8 {%k6}
 | 
			
		||||
 -      -      -     0.01    -      -      -     0.01   0.98    -      -      -     cmpq	%r14, %r15
 | 
			
		||||
 -      -     0.14    -      -      -      -      -     0.86    -      -      -     jb	..B1.16
 | 
			
		||||
@@ -1,77 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      lammps-icc-avx512.s
 | 
			
		||||
Architecture:       CSX
 | 
			
		||||
Timestamp:          2023-02-10 16:30:08
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                      Port pressure in cycles                                      
 | 
			
		||||
     |  0   - 0DV  |  1   |  2   -  2D  |  3   -  3D  |  4   |  5   |  6   |  7   ||  CP  | LCD  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------
 | 
			
		||||
 200 |             |      |             |             |      |      |      |      ||      |      |   # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
 | 
			
		||||
 201 |             |      |             |             |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
 202 |             |      |             |             |      |      |      |      ||      |      |   ..B1.16:                        # Preds ..B1.16 ..B1.15
 | 
			
		||||
 203 |             |      |             |             |      |      |      |      ||      |      |   # Execution count [2.50e+01]
 | 
			
		||||
 204 |             |      |             |             |      | 1.00 |      |      ||      |      |   vpcmpgtd  %ymm4, %ymm3, %k5                             #59.9
 | 
			
		||||
 205 | 0.00        | 1.00 |             |             |      | 0.00 |      |      ||      |      |   vpaddd    %ymm15, %ymm4, %ymm4                          #59.9
 | 
			
		||||
 206 | 0.00        | 1.00 | 0.50   0.50 | 0.50   0.50 |      | 0.00 |      |      ||  0.0 |      |   vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z}                 #60.21
 | 
			
		||||
 207 | 0.00        | 1.00 |             |             |      | 0.00 |      |      ||  1.0 |      |   vpaddd    %ymm17, %ymm17, %ymm18                        #61.36
 | 
			
		||||
 208 | 0.00        | 0.16 |             |             |      | 0.00 | 0.84 |      ||      |      |   addq      $8, %r15                                      #59.9
 | 
			
		||||
 209 | 0.00        | 1.00 |             |             |      | 0.00 |      |      ||  1.0 |      |   vpaddd    %ymm18, %ymm17, %ymm19                        #61.36
 | 
			
		||||
 210 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovw     %k5, %k2                                      #61.36
 | 
			
		||||
 211 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovw     %k5, %k3                                      #61.36
 | 
			
		||||
 212 | 1.00        |      |             |             |      |      |      |      ||      |      |   kmovw     %k5, %k1                                      #61.36
 | 
			
		||||
 213 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vpxord    %zmm21, %zmm21, %zmm21                        #61.36
 | 
			
		||||
 214 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vpxord    %zmm20, %zmm20, %zmm20                        #61.36
 | 
			
		||||
 215 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vpxord    %zmm22, %zmm22, %zmm22                        #61.36
 | 
			
		||||
 216 | 1.25        | 0.75 | 5.00   5.00 | 5.00   5.00 |      | 0.25 | 0.75 |      || 24.0 |      |   vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2}                #61.36
 | 
			
		||||
 217 | 1.25        | 0.25 | 5.00   5.00 | 5.00   5.00 |      | 0.25 | 1.25 |      ||      |      |   vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3}                 #61.36
 | 
			
		||||
 218 | 1.25        | 0.09 | 5.00   5.00 | 5.00   5.00 |      | 0.25 | 1.41 |      ||      |      |   vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1}               #61.36
 | 
			
		||||
 219 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vsubpd    %zmm21, %zmm1, %zmm18                         #62.36
 | 
			
		||||
 220 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd    %zmm20, %zmm2, %zmm17                         #61.36
 | 
			
		||||
 221 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vsubpd    %zmm22, %zmm0, %zmm19                         #63.36
 | 
			
		||||
 222 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd    %zmm18, %zmm18, %zmm31                        #64.49
 | 
			
		||||
 223 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm17, %zmm17, %zmm31                      #64.49
 | 
			
		||||
 224 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd231pd %zmm19, %zmm19, %zmm31                      #64.63
 | 
			
		||||
 225 | 2.50        |      |             |             |      | 0.50 |      |      ||  8.0 |      |   vrcp14pd  %zmm31, %zmm30                                #75.39
 | 
			
		||||
 226 |             |      |             |             |      | 1.00 |      |      ||      |      |   vcmppd    $1, %zmm14, %zmm31, %k6{%k5}                  #74.22
 | 
			
		||||
 227 |             |      |             |             |      | 1.00 |      |      ||      |      |   vfpclasspd $30, %zmm30, %k0                             #75.39
 | 
			
		||||
 228 |             |      |             |             |      |      |      |      ||      |      | * vmovaps   %zmm31, %zmm23                                #75.39
 | 
			
		||||
 229 | 0.50        |      | 0.50   0.50 | 0.50   0.50 |      | 0.50 |      |      ||  4.0 |      |   vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
 | 
			
		||||
 230 | 1.00        |      |             |             |      |      |      |      ||      |      |   knotw     %k0, %k4                                      #75.39
 | 
			
		||||
 231 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd    %zmm23, %zmm23, %zmm24                        #75.39
 | 
			
		||||
 232 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vfmadd213pd %zmm30, %zmm23, %zmm30{%k4}                 #75.39
 | 
			
		||||
 233 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vfmadd213pd %zmm30, %zmm24, %zmm30{%k4}                 #75.39
 | 
			
		||||
 234 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd    %zmm13, %zmm30, %zmm25                        #76.38
 | 
			
		||||
 235 | 0.50        |      |             |             |      | 0.50 |      |      ||      |      |   vmulpd    %zmm12, %zmm30, %zmm27                        #77.55
 | 
			
		||||
 236 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd    %zmm25, %zmm30, %zmm28                        #76.44
 | 
			
		||||
 237 | 0.50        |      |             |             |      | 0.50 |      |      ||  4.0 |      |   vmulpd    %zmm28, %zmm30, %zmm26                        #76.50
 | 
			
		||||
 238 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmsub213pd %zmm5, %zmm28, %zmm30                       #77.55
 | 
			
		||||
 239 | 0.00        |      |             |             |      | 1.00 |      |      ||  4.0 |      |   vmulpd    %zmm27, %zmm26, %zmm29                        #77.64
 | 
			
		||||
 240 | 0.00        |      |             |             |      | 1.00 |      |      ||  4.0 |      |   vmulpd    %zmm30, %zmm29, %zmm23                        #77.70
 | 
			
		||||
 241 | 0.00        |      |             |             |      | 1.00 |      |      ||  4.0 |      |   vfmadd231pd %zmm17, %zmm23, %zmm10{%k6}                 #78.17
 | 
			
		||||
 242 | 0.00        |      |             |             |      | 1.00 |      |      ||      |      |   vfmadd231pd %zmm18, %zmm23, %zmm9{%k6}                  #79.17
 | 
			
		||||
 243 | 0.00        |      |             |             |      | 1.00 |      |      ||      |  4.0 |   vfmadd231pd %zmm19, %zmm23, %zmm8{%k6}                  #80.17
 | 
			
		||||
 244 | 0.00        | 0.00 |             |             |      | 0.00 | 1.00 |      ||      |      |   cmpq      %r14, %r15                                    #59.9
 | 
			
		||||
 245 |             |      |             |             |      |      |      |      ||      |      | * jb        ..B1.16       # Prob 82%                      #59.9
 | 
			
		||||
 246 |             |      |             |             |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       18.8          5.25   16.0   16.0   16.0   16.0          18.8   5.25           86.0    4.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
 243 |  4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6}                  #80.17| [243]
 | 
			
		||||
 242 |  4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6}                  #79.17| [242]
 | 
			
		||||
 241 |  4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6}                 #78.17| [241]
 | 
			
		||||
 208 |  1.0 | addq      $8, %r15                                      #59.9| [208]
 | 
			
		||||
 205 |  1.0 | vpaddd    %ymm15, %ymm4, %ymm4                          #59.9| [205]
 | 
			
		||||
 | 
			
		||||
@@ -1,77 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      lammps-icc-avx512.s
 | 
			
		||||
Architecture:       ICX
 | 
			
		||||
Timestamp:          2023-02-10 16:29:42
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                                 Port pressure in cycles                                                 
 | 
			
		||||
     |  0   - 0DV  |  1   - 1DV  |  2   -  2D  |  3   -  3D  |  4   |   5   |  6   |  7   |  8   |  9   ||  CP  | LCD  |
 | 
			
		||||
------------------------------------------------------------------------------------------------------------------------
 | 
			
		||||
 200 |             |             |             |             |      |       |      |      |      |      ||      |      |   # pointer_increment=64 1303ca335e79351a96cfc07b8b9ec9d4
 | 
			
		||||
 201 |             |             |             |             |      |       |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
 202 |             |             |             |             |      |       |      |      |      |      ||      |      |   ..B1.16:                        # Preds ..B1.16 ..B1.15
 | 
			
		||||
 203 |             |             |             |             |      |       |      |      |      |      ||      |      |   # Execution count [2.50e+01]
 | 
			
		||||
 204 |             |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vpcmpgtd  %ymm4, %ymm3, %k5                             #59.9
 | 
			
		||||
 205 | 0.00        | 1.00        |             |             |      | 0.000 |      |      |      |      ||      |      |   vpaddd    %ymm15, %ymm4, %ymm4                          #59.9
 | 
			
		||||
 206 |             |             | 0.50   0.50 | 0.50   0.50 |      |       |      |      |      |      ||  5.0 |      |   vmovdqu32 (%r10,%r15,4), %ymm17{%k5}{z}                 #60.21
 | 
			
		||||
 207 | 0.00        | 1.00        |             |             |      | 0.000 |      |      |      |      ||  1.0 |      |   vpaddd    %ymm17, %ymm17, %ymm18                        #61.36
 | 
			
		||||
 208 | 0.00        | 0.00        |             |             |      | 0.000 | 1.00 |      |      |      ||      |      |   addq      $8, %r15                                      #59.9
 | 
			
		||||
 209 | 0.00        | 1.00        |             |             |      | 0.000 |      |      |      |      ||  1.0 |      |   vpaddd    %ymm18, %ymm17, %ymm19                        #61.36
 | 
			
		||||
 210 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovw     %k5, %k2                                      #61.36
 | 
			
		||||
 211 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovw     %k5, %k3                                      #61.36
 | 
			
		||||
 212 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   kmovw     %k5, %k1                                      #61.36
 | 
			
		||||
 213 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vpxord    %zmm21, %zmm21, %zmm21                        #61.36
 | 
			
		||||
 214 | 0.24        |             |             |             |      | 0.760 |      |      |      |      ||      |      |   vpxord    %zmm20, %zmm20, %zmm20                        #61.36
 | 
			
		||||
 215 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vpxord    %zmm22, %zmm22, %zmm22                        #61.36
 | 
			
		||||
 216 | 0.67        | 2.33        | 7.00   7.00 | 7.00   7.00 |      | 0.000 |      |      |      |      || 24.0 |      |   vgatherdpd 8(%rbx,%ymm19,8), %zmm21{%k2}                #61.36
 | 
			
		||||
 217 | 0.67        | 2.33        | 7.00   7.00 | 7.00   7.00 |      | 0.000 |      |      |      |      ||      |      |   vgatherdpd (%rbx,%ymm19,8), %zmm20{%k3}                 #61.36
 | 
			
		||||
 218 | 0.67        | 2.33        | 7.00   7.00 | 7.00   7.00 |      | 0.000 |      |      |      |      ||      |      |   vgatherdpd 16(%rbx,%ymm19,8), %zmm22{%k1}               #61.36
 | 
			
		||||
 219 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vsubpd    %zmm21, %zmm1, %zmm18                         #62.36
 | 
			
		||||
 220 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd    %zmm20, %zmm2, %zmm17                         #61.36
 | 
			
		||||
 221 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vsubpd    %zmm22, %zmm0, %zmm19                         #63.36
 | 
			
		||||
 222 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd    %zmm18, %zmm18, %zmm31                        #64.49
 | 
			
		||||
 223 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231pd %zmm17, %zmm17, %zmm31                      #64.49
 | 
			
		||||
 224 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd231pd %zmm19, %zmm19, %zmm31                      #64.63
 | 
			
		||||
 225 | 2.50        |             |             |             |      | 0.500 |      |      |      |      ||  6.0 |      |   vrcp14pd  %zmm31, %zmm30                                #75.39
 | 
			
		||||
 226 |             |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vcmppd    $1, %zmm14, %zmm31, %k6{%k5}                  #74.22
 | 
			
		||||
 227 |             |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfpclasspd $30, %zmm30, %k0                             #75.39
 | 
			
		||||
 228 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmovaps   %zmm31, %zmm23                                #75.39
 | 
			
		||||
 229 | 0.50        |             | 0.50   0.50 | 0.50   0.50 |      | 0.500 |      |      |      |      ||  4.0 |      |   vfnmadd213pd .L_2il0floatpacket.5(%rip){1to8}, %zmm30, %zmm23 #75.39
 | 
			
		||||
 230 | 1.00        |             |             |             |      |       |      |      |      |      ||      |      |   knotw     %k0, %k4                                      #75.39
 | 
			
		||||
 231 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd    %zmm23, %zmm23, %zmm24                        #75.39
 | 
			
		||||
 232 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmadd213pd %zmm30, %zmm23, %zmm30{%k4}                 #75.39
 | 
			
		||||
 233 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vfmadd213pd %zmm30, %zmm24, %zmm30{%k4}                 #75.39
 | 
			
		||||
 234 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd    %zmm13, %zmm30, %zmm25                        #76.38
 | 
			
		||||
 235 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vmulpd    %zmm12, %zmm30, %zmm27                        #77.55
 | 
			
		||||
 236 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd    %zmm25, %zmm30, %zmm28                        #76.44
 | 
			
		||||
 237 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||  4.0 |      |   vmulpd    %zmm28, %zmm30, %zmm26                        #76.50
 | 
			
		||||
 238 | 0.50        |             |             |             |      | 0.500 |      |      |      |      ||      |      |   vfmsub213pd %zmm5, %zmm28, %zmm30                       #77.55
 | 
			
		||||
 239 | 0.25        |             |             |             |      | 0.750 |      |      |      |      ||  4.0 |      |   vmulpd    %zmm27, %zmm26, %zmm29                        #77.64
 | 
			
		||||
 240 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||  4.0 |      |   vmulpd    %zmm30, %zmm29, %zmm23                        #77.70
 | 
			
		||||
 241 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||  4.0 |      |   vfmadd231pd %zmm17, %zmm23, %zmm10{%k6}                 #78.17
 | 
			
		||||
 242 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |      |   vfmadd231pd %zmm18, %zmm23, %zmm9{%k6}                  #79.17
 | 
			
		||||
 243 | 0.00        |             |             |             |      | 1.000 |      |      |      |      ||      |  4.0 |   vfmadd231pd %zmm19, %zmm23, %zmm8{%k6}                  #80.17
 | 
			
		||||
 244 | 0.00        | 0.00        |             |             |      | -0.01 | 1.00 |      |      |      ||      |      |   cmpq      %r14, %r15                                    #59.9
 | 
			
		||||
 245 |             |             |             |             |      |       |      |      |      |      ||      |      | * jb        ..B1.16       # Prob 82%                      #59.9
 | 
			
		||||
 246 |             |             |             |             |      |       |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       18.0          9.98          22.0   22.0   22.0   22.0          18.00   2.00                           89    4.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
 243 |  4.0 | vfmadd231pd %zmm19, %zmm23, %zmm8{%k6}                  #80.17| [243]
 | 
			
		||||
 242 |  4.0 | vfmadd231pd %zmm18, %zmm23, %zmm9{%k6}                  #79.17| [242]
 | 
			
		||||
 241 |  4.0 | vfmadd231pd %zmm17, %zmm23, %zmm10{%k6}                 #78.17| [241]
 | 
			
		||||
 208 |  1.0 | addq      $8, %r15                                      #59.9| [208]
 | 
			
		||||
 205 |  1.0 | vpaddd    %ymm15, %ymm4, %ymm4                          #59.9| [205]
 | 
			
		||||
 | 
			
		||||
@@ -1,197 +0,0 @@
 | 
			
		||||
 | 
			
		||||
[0] Code Region
 | 
			
		||||
 | 
			
		||||
Iterations:        100
 | 
			
		||||
Instructions:      7000
 | 
			
		||||
Total Cycles:      3866
 | 
			
		||||
Total uOps:        7900
 | 
			
		||||
 | 
			
		||||
Dispatch Width:    6
 | 
			
		||||
uOps Per Cycle:    2.04
 | 
			
		||||
IPC:               1.81
 | 
			
		||||
Block RThroughput: 21.5
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Instruction Info:
 | 
			
		||||
[1]: #uOps
 | 
			
		||||
[2]: Latency
 | 
			
		||||
[3]: RThroughput
 | 
			
		||||
[4]: MayLoad
 | 
			
		||||
[5]: MayStore
 | 
			
		||||
[6]: HasSideEffects (U)
 | 
			
		||||
 | 
			
		||||
[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 | 
			
		||||
 1      8     0.50    *                   vpbroadcastd	.LCPI0_1(%rip), %xmm1
 | 
			
		||||
 1      10    0.50    *                   vpmulld	(%r11,%rbp,4), %xmm1, %xmm11
 | 
			
		||||
 2      4     1.50                        vpmovsxdq	%xmm11, %ymm1
 | 
			
		||||
 1      1     0.50                        vpsllq	$3, %ymm1, %ymm1
 | 
			
		||||
 1      1     0.25                        vpaddq	%ymm1, %ymm3, %ymm1
 | 
			
		||||
 1      1     1.00                        vmovq	%xmm1, %r14
 | 
			
		||||
 2      1     1.00                        vpextrq	$1, %xmm1, %r9
 | 
			
		||||
 1      4     1.00                        vextracti128	$1, %ymm1, %xmm1
 | 
			
		||||
 1      8     0.50    *                   vmovsd	(%r14), %xmm2
 | 
			
		||||
 1      8     0.50    *                   vpsubd	.LCPI0_5, %xmm11, %xmm6
 | 
			
		||||
 2      4     1.50                        vpmovsxdq	%xmm6, %ymm6
 | 
			
		||||
 1      1     0.50                        vpsllq	$3, %ymm6, %ymm6
 | 
			
		||||
 1      1     1.00                        vmovq	%xmm1, %rdi
 | 
			
		||||
 1      1     0.25                        vpaddq	%ymm6, %ymm3, %ymm6
 | 
			
		||||
 1      1     1.00                        vmovq	%xmm6, %rcx
 | 
			
		||||
 2      1     1.00                        vpextrq	$1, %xmm1, %rbx
 | 
			
		||||
 2      1     1.00                        vpextrq	$1, %xmm6, %rax
 | 
			
		||||
 1      4     1.00                        vextracti128	$1, %ymm6, %xmm1
 | 
			
		||||
 1      8     0.50    *                   vmovsd	(%rdi), %xmm6
 | 
			
		||||
 1      1     1.00                        vmovq	%xmm1, %rdi
 | 
			
		||||
 2      1     1.00                        vpextrq	$1, %xmm1, %rsi
 | 
			
		||||
 1      8     0.50    *                   vmovsd	(%rdi), %xmm1
 | 
			
		||||
 1      8     0.50    *                   vmovsd	(%rcx), %xmm7
 | 
			
		||||
 1      8     0.50    *                   vpbroadcastd	.LCPI0_2(%rip), %xmm12
 | 
			
		||||
 1      8     0.50    *                   vmovhpd	(%r9), %xmm2, %xmm2
 | 
			
		||||
 1      1     0.25                        vpaddd	%xmm12, %xmm11, %xmm4
 | 
			
		||||
 2      4     1.50                        vpmovsxdq	%xmm4, %ymm4
 | 
			
		||||
 1      8     0.50    *                   vmovhpd	(%rax), %xmm7, %xmm7
 | 
			
		||||
 1      1     0.50                        vpsllq	$3, %ymm4, %ymm4
 | 
			
		||||
 1      1     0.25                        vpaddq	%ymm4, %ymm3, %ymm4
 | 
			
		||||
 1      8     0.50    *                   vmovhpd	(%rbx), %xmm6, %xmm6
 | 
			
		||||
 2      1     1.00                        vpextrq	$1, %xmm4, %rax
 | 
			
		||||
 1      8     0.50    *                   vmovhpd	(%rsi), %xmm1, %xmm1
 | 
			
		||||
 1      1     1.00                        vmovq	%xmm4, %rcx
 | 
			
		||||
 1      4     1.00                        vextracti128	$1, %ymm4, %xmm4
 | 
			
		||||
 1      1     1.00                        vmovq	%xmm4, %rsi
 | 
			
		||||
 1      2     1.00                        vinsertf128	$1, %xmm6, %ymm2, %ymm2
 | 
			
		||||
 2      1     1.00                        vpextrq	$1, %xmm4, %rdi
 | 
			
		||||
 1      8     0.50    *                   vmovsd	(%rsi), %xmm4
 | 
			
		||||
 1      3     0.50                        vsubpd	%ymm2, %ymm14, %ymm2
 | 
			
		||||
 1      8     0.50    *                   vmovhpd	(%rdi), %xmm4, %xmm4
 | 
			
		||||
 1      8     0.50    *                   vmovsd	(%rcx), %xmm6
 | 
			
		||||
 1      2     1.00                        vinsertf128	$1, %xmm1, %ymm7, %ymm1
 | 
			
		||||
 1      8     0.50    *                   vmovhpd	(%rax), %xmm6, %xmm6
 | 
			
		||||
 1      2     1.00                        vinsertf128	$1, %xmm4, %ymm6, %ymm4
 | 
			
		||||
 1      3     0.50                        vsubpd	%ymm1, %ymm5, %ymm1
 | 
			
		||||
 1      3     0.50                        vsubpd	%ymm4, %ymm10, %ymm4
 | 
			
		||||
 1      3     0.50                        vmulpd	%ymm2, %ymm2, %ymm6
 | 
			
		||||
 1      4     1.00                        vfmadd231pd	%ymm1, %ymm1, %ymm6
 | 
			
		||||
 1      4     1.00                        vfmadd231pd	%ymm4, %ymm4, %ymm6
 | 
			
		||||
 1      8     0.50    *                   vbroadcastsd	.LCPI0_3(%rip), %ymm7
 | 
			
		||||
 1      13    5.00                        vdivpd	%ymm6, %ymm7, %ymm7
 | 
			
		||||
 1      3     0.50                        vmulpd	%ymm7, %ymm7, %ymm11
 | 
			
		||||
 1      3     0.50                        vmulpd	%ymm9, %ymm11, %ymm11
 | 
			
		||||
 1      8     0.50    *                   vbroadcastsd	.LCPI0_4(%rip), %ymm12
 | 
			
		||||
 1      3     0.50                        vmulpd	%ymm7, %ymm11, %ymm11
 | 
			
		||||
 1      3     0.50                        vaddpd	%ymm12, %ymm11, %ymm12
 | 
			
		||||
 1      10    0.50    *                   vmulpd	128(%rsp), %ymm7, %ymm7
 | 
			
		||||
 1      3     0.50                        vmulpd	%ymm7, %ymm11, %ymm7
 | 
			
		||||
 1      3     0.50                        vmulpd	%ymm7, %ymm12, %ymm7
 | 
			
		||||
 1      1     0.50                        vcmpltpd	%ymm8, %ymm6, %ymm6
 | 
			
		||||
 1      4     1.00                        vfmadd213pd	%ymm0, %ymm7, %ymm2
 | 
			
		||||
 1      1     0.50                        vblendvpd	%ymm6, %ymm2, %ymm0, %ymm0
 | 
			
		||||
 1      4     1.00                        vfmadd213pd	%ymm15, %ymm7, %ymm1
 | 
			
		||||
 1      4     1.00                        vfmadd213pd	%ymm13, %ymm7, %ymm4
 | 
			
		||||
 1      1     0.50                        vblendvpd	%ymm6, %ymm1, %ymm15, %ymm15
 | 
			
		||||
 1      1     0.50                        vblendvpd	%ymm6, %ymm4, %ymm13, %ymm13
 | 
			
		||||
 1      1     0.25                        addq	$4, %rbp
 | 
			
		||||
 1      1     0.25                        cmpq	%rdx, %rbp
 | 
			
		||||
 1      1     0.50                        jb	.LBB0_9
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resources:
 | 
			
		||||
[0]   - Zn3AGU0
 | 
			
		||||
[1]   - Zn3AGU1
 | 
			
		||||
[2]   - Zn3AGU2
 | 
			
		||||
[3]   - Zn3ALU0
 | 
			
		||||
[4]   - Zn3ALU1
 | 
			
		||||
[5]   - Zn3ALU2
 | 
			
		||||
[6]   - Zn3ALU3
 | 
			
		||||
[7]   - Zn3BRU1
 | 
			
		||||
[8]   - Zn3FPP0
 | 
			
		||||
[9]   - Zn3FPP1
 | 
			
		||||
[10]  - Zn3FPP2
 | 
			
		||||
[11]  - Zn3FPP3
 | 
			
		||||
[12.0] - Zn3FPP45
 | 
			
		||||
[12.1] - Zn3FPP45
 | 
			
		||||
[13]  - Zn3FPSt
 | 
			
		||||
[14.0] - Zn3LSU
 | 
			
		||||
[14.1] - Zn3LSU
 | 
			
		||||
[14.2] - Zn3LSU
 | 
			
		||||
[15.0] - Zn3Load
 | 
			
		||||
[15.1] - Zn3Load
 | 
			
		||||
[15.2] - Zn3Load
 | 
			
		||||
[16.0] - Zn3Store
 | 
			
		||||
[16.1] - Zn3Store
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Resource pressure per iteration:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] 
 | 
			
		||||
 -      -      -     0.60   0.60   0.60   0.60   0.60   16.84  23.53  16.30  7.33   21.50  21.50   -     6.33   6.33   6.34   6.33   6.33   6.34    -      -     
 | 
			
		||||
 | 
			
		||||
Resource pressure by instruction:
 | 
			
		||||
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.03   0.97    -     0.51   0.49    -     0.34   0.33   0.33   0.34   0.33   0.33    -      -     vpbroadcastd	.LCPI0_1(%rip), %xmm1
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.65    -      -     0.35   0.34   0.66    -     0.49   0.05   0.46   0.49   0.05   0.46    -      -     vpmulld	(%r11,%rbp,4), %xmm1, %xmm11
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.06   2.94    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxdq	%xmm11, %ymm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.65   0.35    -      -      -      -      -      -      -      -      -      -      -      -     vpsllq	$3, %ymm1, %ymm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vpaddq	%ymm1, %ymm3, %ymm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -     vmovq	%xmm1, %r14
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -     vpextrq	$1, %xmm1, %r9
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     vextracti128	$1, %ymm1, %xmm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.48   0.35   0.17   0.48   0.35   0.17    -      -     vmovsd	(%r14), %xmm2
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.01   0.18   0.17   0.64   0.47   0.53    -     0.34   0.33   0.33   0.34   0.33   0.33    -      -     vpsubd	.LCPI0_5, %xmm11, %xmm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     1.92   1.08    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxdq	%xmm6, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.32   0.68    -      -      -      -      -      -      -      -      -      -      -      -     vpsllq	$3, %ymm6, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.30   0.70    -      -      -      -      -      -      -      -      -     vmovq	%xmm1, %rdi
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     0.32   0.68    -      -      -      -      -      -      -      -      -      -      -     vpaddq	%ymm6, %ymm3, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.34   0.66    -      -      -      -      -      -      -      -      -     vmovq	%xmm6, %rcx
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -     vpextrq	$1, %xmm1, %rbx
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -     vpextrq	$1, %xmm6, %rax
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     vextracti128	$1, %ymm6, %xmm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.03   0.65   0.32   0.03   0.65   0.32    -      -     vmovsd	(%rdi), %xmm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     0.36   1.64    -      -      -      -      -      -      -      -      -     vmovq	%xmm1, %rdi
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.64   0.36    -      -      -      -      -      -      -      -      -     vpextrq	$1, %xmm1, %rsi
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     0.32   0.68    -     0.51   0.33   0.16   0.51   0.33   0.16    -      -     vmovsd	(%rdi), %xmm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     0.68   0.32    -     0.49   0.01   0.50   0.49   0.01   0.50    -      -     vmovsd	(%rcx), %xmm7
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.48   0.52    -     0.67   0.33    -     0.17   0.62   0.21   0.17   0.62   0.21    -      -     vpbroadcastd	.LCPI0_2(%rip), %xmm12
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.01   0.99    -     0.17   0.83    -     0.02   0.64   0.34   0.02   0.64   0.34    -      -     vmovhpd	(%r9), %xmm2, %xmm2
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.01    -      -     0.99    -      -      -      -      -      -      -      -      -      -      -     vpaddd	%xmm12, %xmm11, %xmm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.57   2.43    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxdq	%xmm4, %ymm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.34   0.66    -     0.82   0.18    -     0.49   0.35   0.16   0.49   0.35   0.16    -      -     vmovhpd	(%rax), %xmm7, %xmm7
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.34   0.66    -      -      -      -      -      -      -      -      -      -      -      -     vpsllq	$3, %ymm4, %ymm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     0.01   0.99    -      -      -      -      -      -      -      -      -      -      -     vpaddq	%ymm4, %ymm3, %ymm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.51   0.49    -     0.49   0.51    -     0.35   0.16   0.49   0.35   0.16   0.49    -      -     vmovhpd	(%rbx), %xmm6, %xmm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.04   0.96    -      -      -      -      -      -      -      -      -     vpextrq	$1, %xmm4, %rax
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.49   0.51    -     0.17   0.83    -     0.16   0.49   0.35   0.16   0.49   0.35    -      -     vmovhpd	(%rsi), %xmm1, %xmm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -     vmovq	%xmm4, %rcx
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     vextracti128	$1, %ymm4, %xmm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -     vmovq	%xmm4, %rsi
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vinsertf128	$1, %xmm6, %ymm2, %ymm2
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -     vpextrq	$1, %xmm4, %rdi
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     0.50   0.50    -     0.49   0.35   0.16   0.49   0.35   0.16    -      -     vmovsd	(%rsi), %xmm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     0.31   0.69    -      -      -      -      -      -      -      -      -      -      -     vsubpd	%ymm2, %ymm14, %ymm2
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.49   0.51    -     0.48   0.52    -     0.35   0.16   0.49   0.35   0.16   0.49    -      -     vmovhpd	(%rdi), %xmm4, %xmm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -      -      -     0.52   0.48    -     0.16   0.49   0.35   0.16   0.49   0.35    -      -     vmovsd	(%rcx), %xmm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vinsertf128	$1, %xmm1, %ymm7, %ymm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.35   0.65    -     0.50   0.50    -     0.47   0.35   0.18   0.47   0.35   0.18    -      -     vmovhpd	(%rax), %xmm6, %xmm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vinsertf128	$1, %xmm4, %ymm6, %ymm4
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     0.33   0.67    -      -      -      -      -      -      -      -      -      -      -     vsubpd	%ymm1, %ymm5, %ymm1
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     0.51   0.49    -      -      -      -      -      -      -      -      -      -      -     vsubpd	%ymm4, %ymm10, %ymm4
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.52   0.48    -      -      -      -      -      -      -      -      -      -      -      -      -     vmulpd	%ymm2, %ymm2, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vfmadd231pd	%ymm1, %ymm1, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.34   0.66    -      -      -      -      -      -      -      -      -      -      -      -      -     vfmadd231pd	%ymm4, %ymm4, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.66   0.34    -     0.51   0.49    -     0.19   0.32   0.49   0.19   0.32   0.49    -      -     vbroadcastsd	.LCPI0_3(%rip), %ymm7
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     5.00    -      -      -      -      -      -      -      -      -      -      -      -      -     vdivpd	%ymm6, %ymm7, %ymm7
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.50   0.50    -      -      -      -      -      -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm7, %ymm11
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.99   0.01    -      -      -      -      -      -      -      -      -      -      -      -      -     vmulpd	%ymm9, %ymm11, %ymm11
 | 
			
		||||
 -      -      -      -      -      -      -      -      -     0.30   0.70    -     0.49   0.51    -     0.34   0.33   0.33   0.34   0.33   0.33    -      -     vbroadcastsd	.LCPI0_4(%rip), %ymm12
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.82   0.18    -      -      -      -      -      -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm11, %ymm11
 | 
			
		||||
 -      -      -      -      -      -      -      -      -      -     0.17   0.83    -      -      -      -      -      -      -      -      -      -      -     vaddpd	%ymm12, %ymm11, %ymm12
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.01   0.99    -      -     0.18   0.82    -     0.46   0.02   0.52   0.46   0.02   0.52    -      -     vmulpd	128(%rsp), %ymm7, %ymm7
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.99   0.01    -      -      -      -      -      -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm11, %ymm7
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.67   0.33    -      -      -      -      -      -      -      -      -      -      -      -      -     vmulpd	%ymm7, %ymm12, %ymm7
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -     vcmpltpd	%ymm8, %ymm6, %ymm6
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.34   0.66    -      -      -      -      -      -      -      -      -      -      -      -      -     vfmadd213pd	%ymm0, %ymm7, %ymm2
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.66   0.34    -      -      -      -      -      -      -      -      -      -      -      -      -     vblendvpd	%ymm6, %ymm2, %ymm0, %ymm0
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.66   1.34    -      -      -      -      -      -      -      -      -      -      -      -      -     vfmadd213pd	%ymm15, %ymm7, %ymm1
 | 
			
		||||
 -      -      -      -      -      -      -      -     1.34   0.66    -      -      -      -      -      -      -      -      -      -      -      -      -     vfmadd213pd	%ymm13, %ymm7, %ymm4
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.34   0.66    -      -      -      -      -      -      -      -      -      -      -      -      -     vblendvpd	%ymm6, %ymm1, %ymm15, %ymm15
 | 
			
		||||
 -      -      -      -      -      -      -      -     0.99   0.01    -      -      -      -      -      -      -      -      -      -      -      -      -     vblendvpd	%ymm6, %ymm4, %ymm13, %ymm13
 | 
			
		||||
 -      -      -      -     0.40   0.20   0.40    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     addq	$4, %rbp
 | 
			
		||||
 -      -      -     0.20   0.20   0.40   0.20    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     cmpq	%rdx, %rbp
 | 
			
		||||
 -      -      -     0.40    -      -      -     0.60    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     jb	.LBB0_9
 | 
			
		||||
@@ -1,108 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      lammps-icx-avx2zen.s
 | 
			
		||||
Architecture:       ZEN3
 | 
			
		||||
Timestamp:          2023-02-10 16:31:30
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                                           Port pressure in cycles                                                           
 | 
			
		||||
     |  0   |  1   |  2   |  3   | DV0  | DV1  |  4   |  5   |  6   |  7   |  8   - 8DV  |  9   |  10  |  11  |  12  |  13  ||  CP  | LCD  |
 | 
			
		||||
--------------------------------------------------------------------------------------------------------------------------------------------
 | 
			
		||||
 175 |      |      |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   # pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
 | 
			
		||||
 176 |      |      |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   # LLVM-MCA-BEGIN
 | 
			
		||||
 177 |      |      |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   .LBB0_9:                                #
 | 
			
		||||
 178 |      |      |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   #   Parent Loop BB0_6 Depth=1
 | 
			
		||||
 179 |      |      |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
 180 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||  1.0 |      |   vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
 | 
			
		||||
 181 | 0.00 |      |      | 1.00 |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||  3.0 |      |   vpmulld (%r11,%rbp,4), %xmm1, %xmm11
 | 
			
		||||
 182 | 0.00 | 0.75 | 0.38 | 0.87 |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vpmovsxdq %xmm11, %ymm1
 | 
			
		||||
 183 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vpsllq $3, %ymm1, %ymm1
 | 
			
		||||
 184 | 0.00 | 0.00 | 0.50 | 0.50 |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vpaddq %ymm1, %ymm3, %ymm1
 | 
			
		||||
 185 | 0.00 | 0.00 | 0.50 | 0.50 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm1, %r14
 | 
			
		||||
 186 | 0.12 | 1.88 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm1, %r9
 | 
			
		||||
 187 |      | 1.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vextracti128 $1, %ymm1, %xmm1
 | 
			
		||||
 188 |      |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%r14), %xmm2           # xmm2 = mem[0],zero
 | 
			
		||||
 189 | 0.00 | 0.00 | 0.50 | 0.50 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpsubd .LCPI0_5, %xmm11, %xmm6
 | 
			
		||||
 190 | 0.00 | 0.75 | 0.38 | 0.87 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpmovsxdq %xmm6, %ymm6
 | 
			
		||||
 191 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpsllq $3, %ymm6, %ymm6
 | 
			
		||||
 192 | 0.00 | 0.00 | 0.49 | 0.51 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm1, %rdi
 | 
			
		||||
 193 | 0.00 | 0.00 | 0.51 | 0.49 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpaddq %ymm6, %ymm3, %ymm6
 | 
			
		||||
 194 | 0.00 | 0.00 | 0.49 | 0.51 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm6, %rcx
 | 
			
		||||
 195 | 0.13 | 1.87 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  6.0 |      |   vpextrq $1, %xmm1, %rbx
 | 
			
		||||
 196 | 0.00 | 2.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm6, %rax
 | 
			
		||||
 197 |      | 1.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vextracti128 $1, %ymm6, %xmm1
 | 
			
		||||
 198 |      |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rdi), %xmm6           # xmm6 = mem[0],zero
 | 
			
		||||
 199 | 0.00 | 0.00 | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm1, %rdi
 | 
			
		||||
 200 | 0.00 | 2.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm1, %rsi
 | 
			
		||||
 201 |      |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rdi), %xmm1           # xmm1 = mem[0],zero
 | 
			
		||||
 202 |      |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rcx), %xmm7           # xmm7 = mem[0],zero
 | 
			
		||||
 203 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
 | 
			
		||||
 204 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%r9), %xmm2, %xmm2     # xmm2 = xmm2[0],mem[0]
 | 
			
		||||
 205 | 0.00 | 0.00 | 0.63 | 0.37 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpaddd %xmm12, %xmm11, %xmm4
 | 
			
		||||
 206 | 0.00 | 0.75 | 0.00 | 1.25 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpmovsxdq %xmm4, %ymm4
 | 
			
		||||
 207 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rax), %xmm7, %xmm7    # xmm7 = xmm7[0],mem[0]
 | 
			
		||||
 208 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpsllq $3, %ymm4, %ymm4
 | 
			
		||||
 209 | 0.00 | 0.00 | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpaddq %ymm4, %ymm3, %ymm4
 | 
			
		||||
 210 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||  5.0 |      |   vmovhpd (%rbx), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
 | 
			
		||||
 211 | 0.75 | 1.25 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm4, %rax
 | 
			
		||||
 212 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rsi), %xmm1, %xmm1    # xmm1 = xmm1[0],mem[0]
 | 
			
		||||
 213 | 0.00 | 0.00 | 0.50 | 0.50 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm4, %rcx
 | 
			
		||||
 214 |      | 1.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vextracti128 $1, %ymm4, %xmm4
 | 
			
		||||
 215 | 0.00 | 0.00 | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm4, %rsi
 | 
			
		||||
 216 |      | 1.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vinsertf128 $1, %xmm6, %ymm2, %ymm2
 | 
			
		||||
 217 | 1.00 | 1.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm4, %rdi
 | 
			
		||||
 218 |      |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rsi), %xmm4           # xmm4 = mem[0],zero
 | 
			
		||||
 219 |      |      | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vsubpd %ymm2, %ymm14, %ymm2
 | 
			
		||||
 220 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rdi), %xmm4, %xmm4    # xmm4 = xmm4[0],mem[0]
 | 
			
		||||
 221 |      |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rcx), %xmm6           # xmm6 = mem[0],zero
 | 
			
		||||
 222 |      | 1.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vinsertf128 $1, %xmm1, %ymm7, %ymm1
 | 
			
		||||
 223 |      | 0.00 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rax), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
 | 
			
		||||
 224 |      | 1.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vinsertf128 $1, %xmm4, %ymm6, %ymm4
 | 
			
		||||
 225 |      |      | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vsubpd %ymm1, %ymm5, %ymm1
 | 
			
		||||
 226 |      |      | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vsubpd %ymm4, %ymm10, %ymm4
 | 
			
		||||
 227 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm2, %ymm2, %ymm6
 | 
			
		||||
 228 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
 | 
			
		||||
 229 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
 | 
			
		||||
 230 | 1.00 |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 | 
			
		||||
 231 |      |      |      |      | 4.50 | 4.50 |      |      |      |      |             |      |      |      |      |      || 13.0 |      |   vdivpd %ymm6, %ymm7, %ymm7
 | 
			
		||||
 232 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm7, %ymm7, %ymm11
 | 
			
		||||
 233 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm9, %ymm11, %ymm11
 | 
			
		||||
 234 | 1.00 |      |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 | 
			
		||||
 235 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm7, %ymm11, %ymm11
 | 
			
		||||
 236 |      |      | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vaddpd %ymm12, %ymm11, %ymm12
 | 
			
		||||
 237 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
 | 
			
		||||
 238 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmulpd %ymm7, %ymm11, %ymm7
 | 
			
		||||
 239 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm7, %ymm12, %ymm7
 | 
			
		||||
 240 |      |      | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vcmpltpd %ymm8, %ymm6, %ymm6
 | 
			
		||||
 241 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
 | 
			
		||||
 242 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
 | 
			
		||||
 243 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
 | 
			
		||||
 244 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |  4.0 |   vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
 | 
			
		||||
 245 | 1.00 | 0.00 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
 | 
			
		||||
 246 | 0.75 | 0.25 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |  1.0 |   vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
 | 
			
		||||
 247 |      |      |      |      |      |      |      |      | 0.25 | 0.25 | 0.25        | 0.25 |      |      |      |      ||      |      |   addq $4, %rbp
 | 
			
		||||
 248 |      |      |      |      |      |      |      |      | 0.25 | 0.25 | 0.25        | 0.25 |      |      |      |      ||      |      |   cmpq %rdx, %rbp
 | 
			
		||||
 249 |      |      |      |      |      |      |      |      | 0.00 |      |             |      | 1.00 |      |      |      ||      |      |   jb .LBB0_9
 | 
			
		||||
 250 |      |      |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   # LLVM-MCA-END
 | 
			
		||||
 | 
			
		||||
       18.8   18.5   15.9   15.9   4.50   4.50                 0.50   0.50   0.50          0.50          9.00   9.00             72    5.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
 244 |  5.0 | vfmadd213pd	%ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [244, 246]
 | 
			
		||||
 243 |  5.0 | vfmadd213pd	%ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [243, 245]
 | 
			
		||||
 241 |  5.0 | vfmadd213pd	%ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [241, 242]
 | 
			
		||||
 247 |  1.0 | addq	$4, %rbp                       | [247]
 | 
			
		||||
 246 |  1.0 | vblendvpd	%ymm6, %ymm4, %ymm13, %ymm13| [246]
 | 
			
		||||
 245 |  1.0 | vblendvpd	%ymm6, %ymm1, %ymm15, %ymm15| [245]
 | 
			
		||||
 242 |  1.0 | vblendvpd	%ymm6, %ymm2, %ymm0, %ymm0| [242]
 | 
			
		||||
 | 
			
		||||
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											Binary file not shown.
										
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
										
											Binary file not shown.
										
									
								
							@@ -1,640 +0,0 @@
 | 
			
		||||
	.text
 | 
			
		||||
	.file	"force_lj.c"
 | 
			
		||||
	.section	.rodata.cst8,"aM",@progbits,8
 | 
			
		||||
	.p2align	3               # -- Begin function computeForceLJFullNeigh_plain_c
 | 
			
		||||
.LCPI0_0:
 | 
			
		||||
	.quad	4631952216750555136     #  48
 | 
			
		||||
.LCPI0_3:
 | 
			
		||||
	.quad	4607182418800017408     #  1
 | 
			
		||||
.LCPI0_4:
 | 
			
		||||
	.quad	-4620693217682128896    #  -0.5
 | 
			
		||||
	.section	.rodata.cst4,"aM",@progbits,4
 | 
			
		||||
	.p2align	2
 | 
			
		||||
.LCPI0_1:
 | 
			
		||||
	.long	3                       # 0x3
 | 
			
		||||
.LCPI0_2:
 | 
			
		||||
	.long	2                       # 0x2
 | 
			
		||||
	.section	.rodata.cst16,"aM",@progbits,16
 | 
			
		||||
	.p2align	4
 | 
			
		||||
.LCPI0_5:
 | 
			
		||||
	.zero	16,255
 | 
			
		||||
	.text
 | 
			
		||||
	.globl	computeForceLJFullNeigh_plain_c
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
	.type	computeForceLJFullNeigh_plain_c,@function
 | 
			
		||||
computeForceLJFullNeigh_plain_c:        # 
 | 
			
		||||
.LcomputeForceLJFullNeigh_plain_c$local:
 | 
			
		||||
	.cfi_startproc
 | 
			
		||||
# %bb.0:                                # 
 | 
			
		||||
	pushq	%rbp
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	pushq	%r15
 | 
			
		||||
	.cfi_def_cfa_offset 24
 | 
			
		||||
	pushq	%r14
 | 
			
		||||
	.cfi_def_cfa_offset 32
 | 
			
		||||
	pushq	%r13
 | 
			
		||||
	.cfi_def_cfa_offset 40
 | 
			
		||||
	pushq	%r12
 | 
			
		||||
	.cfi_def_cfa_offset 48
 | 
			
		||||
	pushq	%rbx
 | 
			
		||||
	.cfi_def_cfa_offset 56
 | 
			
		||||
	subq	$264, %rsp              # imm = 0x108
 | 
			
		||||
	.cfi_def_cfa_offset 320
 | 
			
		||||
	.cfi_offset %rbx, -56
 | 
			
		||||
	.cfi_offset %r12, -48
 | 
			
		||||
	.cfi_offset %r13, -40
 | 
			
		||||
	.cfi_offset %r14, -32
 | 
			
		||||
	.cfi_offset %r15, -24
 | 
			
		||||
	.cfi_offset %rbp, -16
 | 
			
		||||
	movq	%rcx, %rbx
 | 
			
		||||
	movq	%rdx, %r15
 | 
			
		||||
	movq	%rsi, %r12
 | 
			
		||||
	movl	4(%rsi), %r14d
 | 
			
		||||
	vmovsd	144(%rdi), %xmm0        # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, (%rsp)           # 8-byte Spill
 | 
			
		||||
	vmovsd	40(%rdi), %xmm0         # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, 128(%rsp)        # 8-byte Spill
 | 
			
		||||
	vmovq	56(%rdi), %xmm0         # xmm0 = mem[0],zero
 | 
			
		||||
	vmovdqa	%xmm0, 80(%rsp)         # 16-byte Spill
 | 
			
		||||
	testl	%r14d, %r14d
 | 
			
		||||
	jle	.LBB0_2
 | 
			
		||||
# %bb.1:                                # 
 | 
			
		||||
	movq	64(%r12), %rdi
 | 
			
		||||
	leaq	(,%r14,8), %rax
 | 
			
		||||
	leaq	(%rax,%rax,2), %rdx
 | 
			
		||||
	xorl	%esi, %esi
 | 
			
		||||
	callq	_intel_fast_memset
 | 
			
		||||
.LBB0_2:                                # 
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	vmovq	%xmm0, 32(%rsp)         # 8-byte Folded Spill
 | 
			
		||||
	movl	$.L.str, %edi
 | 
			
		||||
	callq	likwid_markerStartRegion
 | 
			
		||||
	testl	%r14d, %r14d
 | 
			
		||||
	jle	.LBB0_19
 | 
			
		||||
# %bb.3:                                # 
 | 
			
		||||
	vmovsd	(%rsp), %xmm0           # 8-byte Reload
 | 
			
		||||
                                        # xmm0 = mem[0],zero
 | 
			
		||||
	vmulsd	%xmm0, %xmm0, %xmm13
 | 
			
		||||
	movq	16(%r15), %r11
 | 
			
		||||
	movq	24(%r15), %rsi
 | 
			
		||||
	movslq	8(%r15), %rdi
 | 
			
		||||
	movq	16(%r12), %r15
 | 
			
		||||
	movq	64(%r12), %r8
 | 
			
		||||
	vmovsd	128(%rsp), %xmm0        # 8-byte Reload
 | 
			
		||||
                                        # xmm0 = mem[0],zero
 | 
			
		||||
	vmulsd	.LCPI0_0(%rip), %xmm0, %xmm15
 | 
			
		||||
	movq	%rbx, 24(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovdqu	(%rbx), %xmm14
 | 
			
		||||
	decq	%r14
 | 
			
		||||
	vmovq	%r15, %xmm0
 | 
			
		||||
	vpbroadcastq	%xmm0, %ymm3
 | 
			
		||||
	vbroadcastsd	%xmm13, %ymm2
 | 
			
		||||
	vmovapd	80(%rsp), %xmm12        # 16-byte Reload
 | 
			
		||||
	vbroadcastsd	%xmm12, %ymm8
 | 
			
		||||
	vbroadcastsd	%xmm15, %ymm9
 | 
			
		||||
	shlq	$2, %rdi
 | 
			
		||||
	xorl	%r10d, %r10d
 | 
			
		||||
	movq	%r14, 56(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovapd	%xmm13, 192(%rsp)       # 16-byte Spill
 | 
			
		||||
	movq	%rsi, 48(%rsp)          # 8-byte Spill
 | 
			
		||||
	movq	%rdi, 40(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovapd	%xmm15, 176(%rsp)       # 16-byte Spill
 | 
			
		||||
	vmovupd	%ymm2, 224(%rsp)        # 32-byte Spill
 | 
			
		||||
	vmovupd	%ymm9, 128(%rsp)        # 32-byte Spill
 | 
			
		||||
	jmp	.LBB0_6
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB0_17:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	movq	%r13, %rdx
 | 
			
		||||
.LBB0_5:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	vaddsd	(%r8,%r12,8), %xmm10, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%r8,%r12,8)
 | 
			
		||||
	vaddsd	(%r8,%rbx,8), %xmm11, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%r8,%rbx,8)
 | 
			
		||||
	vaddsd	(%r8,%rbp,8), %xmm5, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%r8,%rbp,8)
 | 
			
		||||
	leal	3(%r13), %eax
 | 
			
		||||
	addl	$6, %r13d
 | 
			
		||||
	testl	%eax, %eax
 | 
			
		||||
	cmovnsl	%eax, %r13d
 | 
			
		||||
	sarl	$2, %r13d
 | 
			
		||||
	movslq	%r13d, %rax
 | 
			
		||||
	vmovq	%rax, %xmm0
 | 
			
		||||
	vmovq	%rdx, %xmm1
 | 
			
		||||
	vpunpcklqdq	%xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
 | 
			
		||||
	vpaddq	%xmm0, %xmm14, %xmm14
 | 
			
		||||
	addq	%rdi, %r11
 | 
			
		||||
	cmpq	%r14, %r10
 | 
			
		||||
	leaq	1(%r10), %r10
 | 
			
		||||
	je	.LBB0_18
 | 
			
		||||
.LBB0_6:                                # 
 | 
			
		||||
                                        # =>This Loop Header: Depth=1
 | 
			
		||||
                                        #     Child Loop BB0_9 Depth 2
 | 
			
		||||
                                        #     Child Loop BB0_13 Depth 2
 | 
			
		||||
	movl	(%rsi,%r10,4), %r13d
 | 
			
		||||
	leal	(%r10,%r10,2), %r12d
 | 
			
		||||
	leal	(%r10,%r10,2), %ebx
 | 
			
		||||
	incl	%ebx
 | 
			
		||||
	leal	(%r10,%r10,2), %ebp
 | 
			
		||||
	addl	$2, %ebp
 | 
			
		||||
	testl	%r13d, %r13d
 | 
			
		||||
	jle	.LBB0_4
 | 
			
		||||
# %bb.7:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	vmovsd	(%r15,%r12,8), %xmm0    # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	(%r15,%rbx,8), %xmm1    # xmm1 = mem[0],zero
 | 
			
		||||
	vmovsd	(%r15,%rbp,8), %xmm2    # xmm2 = mem[0],zero
 | 
			
		||||
	movq	%r13, %rdx
 | 
			
		||||
	movl	$4294967292, %eax       # imm = 0xFFFFFFFC
 | 
			
		||||
	andq	%rax, %rdx
 | 
			
		||||
	vmovapd	%xmm0, 112(%rsp)        # 16-byte Spill
 | 
			
		||||
	vmovapd	%xmm1, 96(%rsp)         # 16-byte Spill
 | 
			
		||||
	vmovapd	%xmm2, (%rsp)           # 16-byte Spill
 | 
			
		||||
	je	.LBB0_16
 | 
			
		||||
# %bb.8:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	movq	%rbp, 64(%rsp)          # 8-byte Spill
 | 
			
		||||
	movq	%rbx, 72(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovdqa	%xmm14, 208(%rsp)       # 16-byte Spill
 | 
			
		||||
	vbroadcastsd	%xmm0, %ymm14
 | 
			
		||||
	vbroadcastsd	%xmm1, %ymm5
 | 
			
		||||
	vbroadcastsd	%xmm2, %ymm10
 | 
			
		||||
	vxorpd	%xmm0, %xmm0, %xmm0
 | 
			
		||||
	vxorpd	%xmm15, %xmm15, %xmm15
 | 
			
		||||
	vxorpd	%xmm13, %xmm13, %xmm13
 | 
			
		||||
	xorl	%ebp, %ebp
 | 
			
		||||
	vmovapd	%ymm8, %ymm9
 | 
			
		||||
	vmovupd	224(%rsp), %ymm8        # 32-byte Reload
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
movl      $111, %ebx # OSACA START MARKER
 | 
			
		||||
.byte     100        # OSACA START MARKER
 | 
			
		||||
.byte     103        # OSACA START MARKER
 | 
			
		||||
.byte     144        # OSACA START MARKER
 | 
			
		||||
# pointer_increment=16 e95035fc9e97f63299dd5188a0872bfc
 | 
			
		||||
# LLVM-MCA-BEGIN
 | 
			
		||||
.LBB0_9:                                # 
 | 
			
		||||
                                        #   Parent Loop BB0_6 Depth=1
 | 
			
		||||
                                        # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
	vpbroadcastd	.LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
 | 
			
		||||
	vpmulld	(%r11,%rbp,4), %xmm1, %xmm11
 | 
			
		||||
	vpmovsxdq	%xmm11, %ymm1
 | 
			
		||||
	vpsllq	$3, %ymm1, %ymm1
 | 
			
		||||
	vpaddq	%ymm1, %ymm3, %ymm1
 | 
			
		||||
	vmovq	%xmm1, %r14
 | 
			
		||||
	vpextrq	$1, %xmm1, %r9
 | 
			
		||||
	vextracti128	$1, %ymm1, %xmm1
 | 
			
		||||
	vmovsd	(%r14), %xmm2           # xmm2 = mem[0],zero
 | 
			
		||||
	vpsubd	.LCPI0_5, %xmm11, %xmm6
 | 
			
		||||
	vpmovsxdq	%xmm6, %ymm6
 | 
			
		||||
	vpsllq	$3, %ymm6, %ymm6
 | 
			
		||||
	vmovq	%xmm1, %rdi
 | 
			
		||||
	vpaddq	%ymm6, %ymm3, %ymm6
 | 
			
		||||
	vmovq	%xmm6, %rcx
 | 
			
		||||
	vpextrq	$1, %xmm1, %rbx
 | 
			
		||||
	vpextrq	$1, %xmm6, %rax
 | 
			
		||||
	vextracti128	$1, %ymm6, %xmm1
 | 
			
		||||
	vmovsd	(%rdi), %xmm6           # xmm6 = mem[0],zero
 | 
			
		||||
	vmovq	%xmm1, %rdi
 | 
			
		||||
	vpextrq	$1, %xmm1, %rsi
 | 
			
		||||
	vmovsd	(%rdi), %xmm1           # xmm1 = mem[0],zero
 | 
			
		||||
	vmovsd	(%rcx), %xmm7           # xmm7 = mem[0],zero
 | 
			
		||||
	vpbroadcastd	.LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
 | 
			
		||||
	vmovhpd	(%r9), %xmm2, %xmm2     # xmm2 = xmm2[0],mem[0]
 | 
			
		||||
	vpaddd	%xmm12, %xmm11, %xmm4
 | 
			
		||||
	vpmovsxdq	%xmm4, %ymm4
 | 
			
		||||
	vmovhpd	(%rax), %xmm7, %xmm7    # xmm7 = xmm7[0],mem[0]
 | 
			
		||||
	vpsllq	$3, %ymm4, %ymm4
 | 
			
		||||
	vpaddq	%ymm4, %ymm3, %ymm4
 | 
			
		||||
	vmovhpd	(%rbx), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
 | 
			
		||||
	vpextrq	$1, %xmm4, %rax
 | 
			
		||||
	vmovhpd	(%rsi), %xmm1, %xmm1    # xmm1 = xmm1[0],mem[0]
 | 
			
		||||
	vmovq	%xmm4, %rcx
 | 
			
		||||
	vextracti128	$1, %ymm4, %xmm4
 | 
			
		||||
	vmovq	%xmm4, %rsi
 | 
			
		||||
	vinsertf128	$1, %xmm6, %ymm2, %ymm2
 | 
			
		||||
	vpextrq	$1, %xmm4, %rdi
 | 
			
		||||
	vmovsd	(%rsi), %xmm4           # xmm4 = mem[0],zero
 | 
			
		||||
	vsubpd	%ymm2, %ymm14, %ymm2
 | 
			
		||||
	vmovhpd	(%rdi), %xmm4, %xmm4    # xmm4 = xmm4[0],mem[0]
 | 
			
		||||
	vmovsd	(%rcx), %xmm6           # xmm6 = mem[0],zero
 | 
			
		||||
	vinsertf128	$1, %xmm1, %ymm7, %ymm1
 | 
			
		||||
	vmovhpd	(%rax), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
 | 
			
		||||
	vinsertf128	$1, %xmm4, %ymm6, %ymm4
 | 
			
		||||
	vsubpd	%ymm1, %ymm5, %ymm1
 | 
			
		||||
	vsubpd	%ymm4, %ymm10, %ymm4
 | 
			
		||||
	vmulpd	%ymm2, %ymm2, %ymm6
 | 
			
		||||
	vfmadd231pd	%ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
 | 
			
		||||
	vfmadd231pd	%ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
 | 
			
		||||
	vbroadcastsd	.LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 | 
			
		||||
	vdivpd	%ymm6, %ymm7, %ymm7
 | 
			
		||||
	vmulpd	%ymm7, %ymm7, %ymm11
 | 
			
		||||
	vmulpd	%ymm9, %ymm11, %ymm11
 | 
			
		||||
	vbroadcastsd	.LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 | 
			
		||||
	vmulpd	%ymm7, %ymm11, %ymm11
 | 
			
		||||
	vaddpd	%ymm12, %ymm11, %ymm12
 | 
			
		||||
	vmulpd	128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
 | 
			
		||||
	vmulpd	%ymm7, %ymm11, %ymm7
 | 
			
		||||
	vmulpd	%ymm7, %ymm12, %ymm7
 | 
			
		||||
	vcmpltpd	%ymm8, %ymm6, %ymm6
 | 
			
		||||
	vfmadd213pd	%ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
 | 
			
		||||
	vblendvpd	%ymm6, %ymm2, %ymm0, %ymm0
 | 
			
		||||
	vfmadd213pd	%ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
 | 
			
		||||
	vfmadd213pd	%ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
 | 
			
		||||
	vblendvpd	%ymm6, %ymm1, %ymm15, %ymm15
 | 
			
		||||
	vblendvpd	%ymm6, %ymm4, %ymm13, %ymm13
 | 
			
		||||
	addq	$4, %rbp
 | 
			
		||||
	cmpq	%rdx, %rbp
 | 
			
		||||
	jb	.LBB0_9
 | 
			
		||||
# LLVM-MCA-END
 | 
			
		||||
movl      $222, %ebx # OSACA END MARKER
 | 
			
		||||
.byte     100        # OSACA END MARKER
 | 
			
		||||
.byte     103        # OSACA END MARKER
 | 
			
		||||
.byte     144        # OSACA END MARKER
 | 
			
		||||
# %bb.10:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	vpermilpd	$1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
 | 
			
		||||
	vaddsd	%xmm1, %xmm0, %xmm1
 | 
			
		||||
	vextractf128	$1, %ymm0, %xmm0
 | 
			
		||||
	vaddsd	%xmm0, %xmm1, %xmm1
 | 
			
		||||
	vpermilpd	$1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
 | 
			
		||||
	vaddsd	%xmm0, %xmm1, %xmm10
 | 
			
		||||
	vpermilpd	$1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
 | 
			
		||||
	vaddsd	%xmm1, %xmm15, %xmm1
 | 
			
		||||
	vextractf128	$1, %ymm15, %xmm2
 | 
			
		||||
	vaddsd	%xmm2, %xmm1, %xmm1
 | 
			
		||||
	vpermilpd	$1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
 | 
			
		||||
	vaddsd	%xmm2, %xmm1, %xmm11
 | 
			
		||||
	vpermilpd	$1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
 | 
			
		||||
	vaddsd	%xmm1, %xmm13, %xmm1
 | 
			
		||||
	vextractf128	$1, %ymm13, %xmm2
 | 
			
		||||
	vaddsd	%xmm2, %xmm1, %xmm1
 | 
			
		||||
	vpermilpd	$1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
 | 
			
		||||
	vaddsd	%xmm2, %xmm1, %xmm5
 | 
			
		||||
	movq	56(%rsp), %r14          # 8-byte Reload
 | 
			
		||||
	vmovapd	80(%rsp), %xmm12        # 16-byte Reload
 | 
			
		||||
	vmovapd	192(%rsp), %xmm13       # 16-byte Reload
 | 
			
		||||
	movq	48(%rsp), %rsi          # 8-byte Reload
 | 
			
		||||
	movq	40(%rsp), %rdi          # 8-byte Reload
 | 
			
		||||
	vmovdqa	208(%rsp), %xmm14       # 16-byte Reload
 | 
			
		||||
	vmovapd	176(%rsp), %xmm15       # 16-byte Reload
 | 
			
		||||
	vmovapd	%ymm9, %ymm8
 | 
			
		||||
	movq	72(%rsp), %rbx          # 8-byte Reload
 | 
			
		||||
	movq	64(%rsp), %rbp          # 8-byte Reload
 | 
			
		||||
	vmovapd	112(%rsp), %xmm0        # 16-byte Reload
 | 
			
		||||
	cmpq	%r13, %rdx
 | 
			
		||||
	jae	.LBB0_17
 | 
			
		||||
	jmp	.LBB0_11
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB0_4:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	movslq	%r13d, %rdx
 | 
			
		||||
	vxorpd	%xmm5, %xmm5, %xmm5
 | 
			
		||||
	vxorpd	%xmm11, %xmm11, %xmm11
 | 
			
		||||
	vxorpd	%xmm10, %xmm10, %xmm10
 | 
			
		||||
	jmp	.LBB0_5
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB0_16:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	vxorpd	%xmm10, %xmm10, %xmm10
 | 
			
		||||
	vxorpd	%xmm11, %xmm11, %xmm11
 | 
			
		||||
	vxorpd	%xmm5, %xmm5, %xmm5
 | 
			
		||||
	cmpq	%r13, %rdx
 | 
			
		||||
	jae	.LBB0_17
 | 
			
		||||
.LBB0_11:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	vmovapd	96(%rsp), %xmm4         # 16-byte Reload
 | 
			
		||||
	jmp	.LBB0_13
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB0_12:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_13 Depth=2
 | 
			
		||||
	incq	%rdx
 | 
			
		||||
	cmpq	%rdx, %r13
 | 
			
		||||
	je	.LBB0_17
 | 
			
		||||
.LBB0_13:                               # 
 | 
			
		||||
                                        #   Parent Loop BB0_6 Depth=1
 | 
			
		||||
                                        # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
	movl	(%r11,%rdx,4), %eax
 | 
			
		||||
	leal	(%rax,%rax,2), %ecx
 | 
			
		||||
	movslq	%ecx, %rcx
 | 
			
		||||
	vsubsd	(%r15,%rcx,8), %xmm0, %xmm6
 | 
			
		||||
	leal	(%rax,%rax,2), %ecx
 | 
			
		||||
	incl	%ecx
 | 
			
		||||
	movslq	%ecx, %rcx
 | 
			
		||||
	vsubsd	(%r15,%rcx,8), %xmm4, %xmm2
 | 
			
		||||
	leal	2(%rax,%rax,2), %eax
 | 
			
		||||
	cltq
 | 
			
		||||
	vmovapd	(%rsp), %xmm1           # 16-byte Reload
 | 
			
		||||
	vsubsd	(%r15,%rax,8), %xmm1, %xmm1
 | 
			
		||||
	vmulsd	%xmm6, %xmm6, %xmm7
 | 
			
		||||
	vfmadd231sd	%xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
 | 
			
		||||
	vfmadd231sd	%xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
 | 
			
		||||
	vucomisd	%xmm13, %xmm7
 | 
			
		||||
	jae	.LBB0_12
 | 
			
		||||
# %bb.14:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_13 Depth=2
 | 
			
		||||
	vmovsd	.LCPI0_3(%rip), %xmm0   # xmm0 = mem[0],zero
 | 
			
		||||
	vdivsd	%xmm7, %xmm0, %xmm7
 | 
			
		||||
	vmulsd	%xmm7, %xmm7, %xmm0
 | 
			
		||||
	vmulsd	%xmm0, %xmm12, %xmm0
 | 
			
		||||
	vmulsd	%xmm7, %xmm0, %xmm0
 | 
			
		||||
	vaddsd	.LCPI0_4(%rip), %xmm0, %xmm4
 | 
			
		||||
	vmulsd	%xmm7, %xmm15, %xmm7
 | 
			
		||||
	vmulsd	%xmm0, %xmm7, %xmm0
 | 
			
		||||
	vmulsd	%xmm4, %xmm0, %xmm0
 | 
			
		||||
	vmovapd	96(%rsp), %xmm4         # 16-byte Reload
 | 
			
		||||
	vfmadd231sd	%xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
 | 
			
		||||
	vfmadd231sd	%xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
 | 
			
		||||
	vfmadd231sd	%xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
 | 
			
		||||
	vmovapd	112(%rsp), %xmm0        # 16-byte Reload
 | 
			
		||||
	jmp	.LBB0_12
 | 
			
		||||
.LBB0_18:                               # 
 | 
			
		||||
	movq	24(%rsp), %rax          # 8-byte Reload
 | 
			
		||||
	vmovdqu	%xmm14, (%rax)
 | 
			
		||||
.LBB0_19:                               # 
 | 
			
		||||
	movl	$.L.str, %edi
 | 
			
		||||
	vzeroupper
 | 
			
		||||
	callq	likwid_markerStopRegion
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	vsubsd	32(%rsp), %xmm0, %xmm0  # 8-byte Folded Reload
 | 
			
		||||
	addq	$264, %rsp              # imm = 0x108
 | 
			
		||||
	.cfi_def_cfa_offset 56
 | 
			
		||||
	popq	%rbx
 | 
			
		||||
	.cfi_def_cfa_offset 48
 | 
			
		||||
	popq	%r12
 | 
			
		||||
	.cfi_def_cfa_offset 40
 | 
			
		||||
	popq	%r13
 | 
			
		||||
	.cfi_def_cfa_offset 32
 | 
			
		||||
	popq	%r14
 | 
			
		||||
	.cfi_def_cfa_offset 24
 | 
			
		||||
	popq	%r15
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	popq	%rbp
 | 
			
		||||
	.cfi_def_cfa_offset 8
 | 
			
		||||
	retq
 | 
			
		||||
.Lfunc_end0:
 | 
			
		||||
	.size	computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
 | 
			
		||||
	.cfi_endproc
 | 
			
		||||
                                        # -- End function
 | 
			
		||||
	.section	.rodata.cst8,"aM",@progbits,8
 | 
			
		||||
	.p2align	3               # -- Begin function computeForceLJHalfNeigh
 | 
			
		||||
.LCPI1_0:
 | 
			
		||||
	.quad	4631952216750555136     #  48
 | 
			
		||||
.LCPI1_1:
 | 
			
		||||
	.quad	4607182418800017408     #  1
 | 
			
		||||
.LCPI1_2:
 | 
			
		||||
	.quad	-4620693217682128896    #  -0.5
 | 
			
		||||
	.text
 | 
			
		||||
	.globl	computeForceLJHalfNeigh
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
	.type	computeForceLJHalfNeigh,@function
 | 
			
		||||
computeForceLJHalfNeigh:                # 
 | 
			
		||||
.LcomputeForceLJHalfNeigh$local:
 | 
			
		||||
	.cfi_startproc
 | 
			
		||||
# %bb.0:                                # 
 | 
			
		||||
	pushq	%rbp
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	pushq	%r15
 | 
			
		||||
	.cfi_def_cfa_offset 24
 | 
			
		||||
	pushq	%r14
 | 
			
		||||
	.cfi_def_cfa_offset 32
 | 
			
		||||
	pushq	%r13
 | 
			
		||||
	.cfi_def_cfa_offset 40
 | 
			
		||||
	pushq	%r12
 | 
			
		||||
	.cfi_def_cfa_offset 48
 | 
			
		||||
	pushq	%rbx
 | 
			
		||||
	.cfi_def_cfa_offset 56
 | 
			
		||||
	subq	$40, %rsp
 | 
			
		||||
	.cfi_def_cfa_offset 96
 | 
			
		||||
	.cfi_offset %rbx, -56
 | 
			
		||||
	.cfi_offset %r12, -48
 | 
			
		||||
	.cfi_offset %r13, -40
 | 
			
		||||
	.cfi_offset %r14, -32
 | 
			
		||||
	.cfi_offset %r15, -24
 | 
			
		||||
	.cfi_offset %rbp, -16
 | 
			
		||||
	movq	%rcx, 16(%rsp)          # 8-byte Spill
 | 
			
		||||
	movq	%rdx, %r15
 | 
			
		||||
	movq	%rsi, %r12
 | 
			
		||||
	movl	4(%rsi), %r13d
 | 
			
		||||
	vmovsd	144(%rdi), %xmm0        # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, 8(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovsd	40(%rdi), %xmm0         # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, (%rsp)           # 8-byte Spill
 | 
			
		||||
	vmovsd	56(%rdi), %xmm0         # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, 32(%rsp)         # 8-byte Spill
 | 
			
		||||
	testl	%r13d, %r13d
 | 
			
		||||
	jle	.LBB1_2
 | 
			
		||||
# %bb.1:                                # 
 | 
			
		||||
	movq	64(%r12), %rdi
 | 
			
		||||
	leaq	(,%r13,8), %rax
 | 
			
		||||
	leaq	(%rax,%rax,2), %rdx
 | 
			
		||||
	xorl	%esi, %esi
 | 
			
		||||
	callq	_intel_fast_memset
 | 
			
		||||
.LBB1_2:                                # 
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	vmovsd	%xmm0, 24(%rsp)         # 8-byte Spill
 | 
			
		||||
	movl	$.L.str.1, %edi
 | 
			
		||||
	callq	likwid_markerStartRegion
 | 
			
		||||
	testl	%r13d, %r13d
 | 
			
		||||
	jle	.LBB1_8
 | 
			
		||||
# %bb.3:                                # 
 | 
			
		||||
	vmovsd	8(%rsp), %xmm0          # 8-byte Reload
 | 
			
		||||
                                        # xmm0 = mem[0],zero
 | 
			
		||||
	vmulsd	%xmm0, %xmm0, %xmm12
 | 
			
		||||
	movq	16(%r15), %rax
 | 
			
		||||
	movq	24(%r15), %rcx
 | 
			
		||||
	movq	%rcx, 8(%rsp)           # 8-byte Spill
 | 
			
		||||
	movslq	8(%r15), %rdx
 | 
			
		||||
	movq	16(%r12), %rsi
 | 
			
		||||
	movq	64(%r12), %rdi
 | 
			
		||||
	vmovsd	(%rsp), %xmm0           # 8-byte Reload
 | 
			
		||||
                                        # xmm0 = mem[0],zero
 | 
			
		||||
	vmulsd	.LCPI1_0(%rip), %xmm0, %xmm11
 | 
			
		||||
	movq	16(%rsp), %rcx          # 8-byte Reload
 | 
			
		||||
	vmovdqu	(%rcx), %xmm10
 | 
			
		||||
	shlq	$2, %rdx
 | 
			
		||||
	movq	%rdx, (%rsp)            # 8-byte Spill
 | 
			
		||||
	xorl	%r12d, %r12d
 | 
			
		||||
	jmp	.LBB1_4
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB1_5:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_4 Depth=1
 | 
			
		||||
	vxorpd	%xmm13, %xmm13, %xmm13
 | 
			
		||||
	movq	%r9, %rdx
 | 
			
		||||
	vxorpd	%xmm9, %xmm9, %xmm9
 | 
			
		||||
	vxorpd	%xmm14, %xmm14, %xmm14
 | 
			
		||||
.LBB1_6:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_4 Depth=1
 | 
			
		||||
	vaddsd	(%rdi,%r15,8), %xmm14, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%r15,8)
 | 
			
		||||
	vaddsd	(%rdi,%r10,8), %xmm9, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%r10,8)
 | 
			
		||||
	vaddsd	(%rdi,%r11,8), %xmm13, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%r11,8)
 | 
			
		||||
	leal	3(%r9), %ecx
 | 
			
		||||
	addl	$6, %r9d
 | 
			
		||||
	testl	%ecx, %ecx
 | 
			
		||||
	cmovnsl	%ecx, %r9d
 | 
			
		||||
	sarl	$2, %r9d
 | 
			
		||||
	movslq	%r9d, %rcx
 | 
			
		||||
	vmovq	%rcx, %xmm0
 | 
			
		||||
	vmovq	%rdx, %xmm1
 | 
			
		||||
	vpunpcklqdq	%xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
 | 
			
		||||
	vpaddq	%xmm0, %xmm10, %xmm10
 | 
			
		||||
	incq	%r12
 | 
			
		||||
	addq	(%rsp), %rax            # 8-byte Folded Reload
 | 
			
		||||
	cmpq	%r13, %r12
 | 
			
		||||
	je	.LBB1_7
 | 
			
		||||
.LBB1_4:                                # 
 | 
			
		||||
                                        # =>This Loop Header: Depth=1
 | 
			
		||||
                                        #     Child Loop BB1_10 Depth 2
 | 
			
		||||
	movq	8(%rsp), %rcx           # 8-byte Reload
 | 
			
		||||
	movslq	(%rcx,%r12,4), %r9
 | 
			
		||||
	leaq	(%r12,%r12,2), %rcx
 | 
			
		||||
	leal	1(%rcx), %r10d
 | 
			
		||||
	leal	2(%rcx), %r11d
 | 
			
		||||
	movl	%ecx, %r15d
 | 
			
		||||
	testq	%r9, %r9
 | 
			
		||||
	jle	.LBB1_5
 | 
			
		||||
# %bb.9:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_4 Depth=1
 | 
			
		||||
	vmovsd	(%rsi,%r15,8), %xmm15   # xmm15 = mem[0],zero
 | 
			
		||||
	vmovsd	(%rsi,%r10,8), %xmm4    # xmm4 = mem[0],zero
 | 
			
		||||
	vmovsd	(%rsi,%r11,8), %xmm1    # xmm1 = mem[0],zero
 | 
			
		||||
	movl	%r9d, %edx
 | 
			
		||||
	vxorpd	%xmm14, %xmm14, %xmm14
 | 
			
		||||
	xorl	%ecx, %ecx
 | 
			
		||||
	vxorpd	%xmm9, %xmm9, %xmm9
 | 
			
		||||
	vxorpd	%xmm13, %xmm13, %xmm13
 | 
			
		||||
	jmp	.LBB1_10
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB1_13:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_10 Depth=2
 | 
			
		||||
	incq	%rcx
 | 
			
		||||
	cmpq	%rcx, %rdx
 | 
			
		||||
	je	.LBB1_6
 | 
			
		||||
.LBB1_10:                               # 
 | 
			
		||||
                                        #   Parent Loop BB1_4 Depth=1
 | 
			
		||||
                                        # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
	movslq	(%rax,%rcx,4), %r8
 | 
			
		||||
	leaq	(%r8,%r8,2), %r14
 | 
			
		||||
	vsubsd	(%rsi,%r14,8), %xmm15, %xmm2
 | 
			
		||||
	movslq	%r14d, %rbp
 | 
			
		||||
	vsubsd	8(%rsi,%rbp,8), %xmm4, %xmm5
 | 
			
		||||
	vsubsd	16(%rsi,%rbp,8), %xmm1, %xmm0
 | 
			
		||||
	vmulsd	%xmm2, %xmm2, %xmm6
 | 
			
		||||
	vfmadd231sd	%xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
 | 
			
		||||
	vfmadd231sd	%xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
 | 
			
		||||
	vucomisd	%xmm12, %xmm6
 | 
			
		||||
	jae	.LBB1_13
 | 
			
		||||
# %bb.11:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_10 Depth=2
 | 
			
		||||
	vmovsd	.LCPI1_1(%rip), %xmm3   # xmm3 = mem[0],zero
 | 
			
		||||
	vdivsd	%xmm6, %xmm3, %xmm6
 | 
			
		||||
	vmulsd	32(%rsp), %xmm6, %xmm3  # 8-byte Folded Reload
 | 
			
		||||
	vmulsd	%xmm6, %xmm6, %xmm8
 | 
			
		||||
	vmulsd	%xmm3, %xmm8, %xmm3
 | 
			
		||||
	vaddsd	.LCPI1_2(%rip), %xmm3, %xmm7
 | 
			
		||||
	vmulsd	%xmm6, %xmm11, %xmm6
 | 
			
		||||
	vmulsd	%xmm3, %xmm6, %xmm3
 | 
			
		||||
	vmulsd	%xmm7, %xmm3, %xmm3
 | 
			
		||||
	vmulsd	%xmm2, %xmm3, %xmm6
 | 
			
		||||
	vaddsd	%xmm6, %xmm14, %xmm14
 | 
			
		||||
	vmulsd	%xmm5, %xmm3, %xmm2
 | 
			
		||||
	vaddsd	%xmm2, %xmm9, %xmm9
 | 
			
		||||
	vmulsd	%xmm0, %xmm3, %xmm0
 | 
			
		||||
	vaddsd	%xmm0, %xmm13, %xmm13
 | 
			
		||||
	cmpl	%r13d, %r8d
 | 
			
		||||
	jge	.LBB1_13
 | 
			
		||||
# %bb.12:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_10 Depth=2
 | 
			
		||||
	leaq	1(%rbp), %rbx
 | 
			
		||||
	addq	$2, %rbp
 | 
			
		||||
	vmovsd	(%rdi,%r14,8), %xmm3    # xmm3 = mem[0],zero
 | 
			
		||||
	vsubsd	%xmm6, %xmm3, %xmm3
 | 
			
		||||
	vmovsd	%xmm3, (%rdi,%r14,8)
 | 
			
		||||
	vmovsd	(%rdi,%rbx,8), %xmm3    # xmm3 = mem[0],zero
 | 
			
		||||
	vsubsd	%xmm2, %xmm3, %xmm2
 | 
			
		||||
	vmovsd	%xmm2, (%rdi,%rbx,8)
 | 
			
		||||
	vmovsd	(%rdi,%rbp,8), %xmm2    # xmm2 = mem[0],zero
 | 
			
		||||
	vsubsd	%xmm0, %xmm2, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%rbp,8)
 | 
			
		||||
	jmp	.LBB1_13
 | 
			
		||||
.LBB1_7:                                # 
 | 
			
		||||
	movq	16(%rsp), %rax          # 8-byte Reload
 | 
			
		||||
	vmovdqu	%xmm10, (%rax)
 | 
			
		||||
.LBB1_8:                                # 
 | 
			
		||||
	movl	$.L.str.1, %edi
 | 
			
		||||
	callq	likwid_markerStopRegion
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	vsubsd	24(%rsp), %xmm0, %xmm0  # 8-byte Folded Reload
 | 
			
		||||
	addq	$40, %rsp
 | 
			
		||||
	.cfi_def_cfa_offset 56
 | 
			
		||||
	popq	%rbx
 | 
			
		||||
	.cfi_def_cfa_offset 48
 | 
			
		||||
	popq	%r12
 | 
			
		||||
	.cfi_def_cfa_offset 40
 | 
			
		||||
	popq	%r13
 | 
			
		||||
	.cfi_def_cfa_offset 32
 | 
			
		||||
	popq	%r14
 | 
			
		||||
	.cfi_def_cfa_offset 24
 | 
			
		||||
	popq	%r15
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	popq	%rbp
 | 
			
		||||
	.cfi_def_cfa_offset 8
 | 
			
		||||
	retq
 | 
			
		||||
.Lfunc_end1:
 | 
			
		||||
	.size	computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
 | 
			
		||||
	.cfi_endproc
 | 
			
		||||
                                        # -- End function
 | 
			
		||||
	.globl	computeForceLJFullNeigh_simd # -- Begin function computeForceLJFullNeigh_simd
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
	.type	computeForceLJFullNeigh_simd,@function
 | 
			
		||||
computeForceLJFullNeigh_simd:           # 
 | 
			
		||||
.LcomputeForceLJFullNeigh_simd$local:
 | 
			
		||||
	.cfi_startproc
 | 
			
		||||
# %bb.0:                                # 
 | 
			
		||||
	pushq	%rax
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	movl	4(%rsi), %eax
 | 
			
		||||
	testl	%eax, %eax
 | 
			
		||||
	jle	.LBB2_2
 | 
			
		||||
# %bb.1:                                # 
 | 
			
		||||
	movq	64(%rsi), %rdi
 | 
			
		||||
	shlq	$3, %rax
 | 
			
		||||
	leaq	(%rax,%rax,2), %rdx
 | 
			
		||||
	xorl	%esi, %esi
 | 
			
		||||
	callq	_intel_fast_memset
 | 
			
		||||
.LBB2_2:                                # 
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	movq	stderr(%rip), %rcx
 | 
			
		||||
	movl	$.L.str.2, %edi
 | 
			
		||||
	movl	$65, %esi
 | 
			
		||||
	movl	$1, %edx
 | 
			
		||||
	callq	fwrite
 | 
			
		||||
	movl	$-1, %edi
 | 
			
		||||
	callq	exit
 | 
			
		||||
.Lfunc_end2:
 | 
			
		||||
	.size	computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
 | 
			
		||||
	.cfi_endproc
 | 
			
		||||
                                        # -- End function
 | 
			
		||||
	.type	.L.str,@object          # 
 | 
			
		||||
	.section	.rodata.str1.1,"aMS",@progbits,1
 | 
			
		||||
.L.str:
 | 
			
		||||
	.asciz	"force"
 | 
			
		||||
	.size	.L.str, 6
 | 
			
		||||
	.type	.L.str.1,@object        # 
 | 
			
		||||
.L.str.1:
 | 
			
		||||
	.asciz	"forceLJ-halfneigh"
 | 
			
		||||
	.size	.L.str.1, 18
 | 
			
		||||
	.type	.L.str.2,@object        # 
 | 
			
		||||
.L.str.2:
 | 
			
		||||
	.asciz	"Error: SIMD kernel not implemented for specified instruction set!"
 | 
			
		||||
	.size	.L.str.2, 66
 | 
			
		||||
	.ident	"Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
 | 
			
		||||
	.section	".note.GNU-stack","",@progbits
 | 
			
		||||
@@ -1,105 +0,0 @@
 | 
			
		||||
Open Source Architecture Code Analyzer (OSACA) - 0.4.12
 | 
			
		||||
Analyzed file:      force_lj_icx_avx2_markers.s
 | 
			
		||||
Architecture:       ZEN3
 | 
			
		||||
Timestamp:          2022-12-12 12:47:07
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 P - Throughput of LOAD operation can be hidden behind a past or future STORE instruction
 | 
			
		||||
 * - Instruction micro-ops not bound to a port
 | 
			
		||||
 X - No throughput/latency information for this instruction in data file
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Combined Analysis Report
 | 
			
		||||
------------------------
 | 
			
		||||
                                                           Port pressure in cycles                                                            
 | 
			
		||||
     |  0   |   1   |  2   |  3   | DV0  | DV1  |  4   |  5   |  6   |  7   |  8   - 8DV  |  9   |  10  |  11  |  12  |  13  ||  CP  | LCD  |
 | 
			
		||||
---------------------------------------------------------------------------------------------------------------------------------------------
 | 
			
		||||
 172 |      |       |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   .LBB0_9:                                #
 | 
			
		||||
 173 |      |       |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   #   Parent Loop BB0_6 Depth=1
 | 
			
		||||
 174 |      |       |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
 175 |      | 0.250 | 0.75 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||  1.0 |      |   vpbroadcastd .LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
 | 
			
		||||
 176 | 0.00 |       |      | 1.00 |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||  3.0 |      |   vpmulld (%r11,%rbp,4), %xmm1, %xmm11
 | 
			
		||||
 177 | 0.00 | 1.010 | 0.25 | 0.74 |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vpmovsxdq %xmm11, %ymm1
 | 
			
		||||
 178 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vpsllq $3, %ymm1, %ymm1
 | 
			
		||||
 179 | 0.00 | 0.000 | 0.49 | 0.51 |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vpaddq %ymm1, %ymm3, %ymm1
 | 
			
		||||
 180 | 0.00 | 0.000 | 0.51 | 0.49 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm1, %r14
 | 
			
		||||
 181 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm1, %r9
 | 
			
		||||
 182 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vextracti128 $1, %ymm1, %xmm1
 | 
			
		||||
 183 |      |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%r14), %xmm2           # xmm2 = mem[0],zero
 | 
			
		||||
 184 | 0.00 | 0.000 | 0.49 | 0.51 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpsubd .LCPI0_5, %xmm11, %xmm6
 | 
			
		||||
 185 | 0.00 | 0.750 | 0.38 | 0.87 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpmovsxdq %xmm6, %ymm6
 | 
			
		||||
 186 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpsllq $3, %ymm6, %ymm6
 | 
			
		||||
 187 | 0.00 | 0.000 | 0.50 | 0.50 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm1, %rdi
 | 
			
		||||
 188 | 0.00 | 0.000 | 0.50 | 0.50 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpaddq %ymm6, %ymm3, %ymm6
 | 
			
		||||
 189 | 0.00 | 0.000 | 0.50 | 0.50 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm6, %rcx
 | 
			
		||||
 190 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  6.0 |      |   vpextrq $1, %xmm1, %rbx
 | 
			
		||||
 191 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm6, %rax
 | 
			
		||||
 192 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vextracti128 $1, %ymm6, %xmm1
 | 
			
		||||
 193 |      |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rdi), %xmm6           # xmm6 = mem[0],zero
 | 
			
		||||
 194 | 0.00 | 0.000 | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm1, %rdi
 | 
			
		||||
 195 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm1, %rsi
 | 
			
		||||
 196 |      |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rdi), %xmm1           # xmm1 = mem[0],zero
 | 
			
		||||
 197 |      |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rcx), %xmm7           # xmm7 = mem[0],zero
 | 
			
		||||
 198 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vpbroadcastd .LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
 | 
			
		||||
 199 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%r9), %xmm2, %xmm2     # xmm2 = xmm2[0],mem[0]
 | 
			
		||||
 200 | 0.00 | 0.000 | 0.62 | 0.38 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpaddd %xmm12, %xmm11, %xmm4
 | 
			
		||||
 201 | 0.00 | 0.750 | 0.00 | 1.25 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpmovsxdq %xmm4, %ymm4
 | 
			
		||||
 202 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rax), %xmm7, %xmm7    # xmm7 = xmm7[0],mem[0]
 | 
			
		||||
 203 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpsllq $3, %ymm4, %ymm4
 | 
			
		||||
 204 | 0.00 | 0.000 | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpaddq %ymm4, %ymm3, %ymm4
 | 
			
		||||
 205 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||  5.0 |      |   vmovhpd (%rbx), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
 | 
			
		||||
 206 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm4, %rax
 | 
			
		||||
 207 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rsi), %xmm1, %xmm1    # xmm1 = xmm1[0],mem[0]
 | 
			
		||||
 208 | 0.00 | 0.000 | 0.51 | 0.49 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm4, %rcx
 | 
			
		||||
 209 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vextracti128 $1, %ymm4, %xmm4
 | 
			
		||||
 210 | 0.00 | -0.01 | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmovq %xmm4, %rsi
 | 
			
		||||
 211 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vinsertf128 $1, %xmm6, %ymm2, %ymm2
 | 
			
		||||
 212 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vpextrq $1, %xmm4, %rdi
 | 
			
		||||
 213 |      |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rsi), %xmm4           # xmm4 = mem[0],zero
 | 
			
		||||
 214 |      |       | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vsubpd %ymm2, %ymm14, %ymm2
 | 
			
		||||
 215 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rdi), %xmm4, %xmm4    # xmm4 = xmm4[0],mem[0]
 | 
			
		||||
 216 |      |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovsd (%rcx), %xmm6           # xmm6 = mem[0],zero
 | 
			
		||||
 217 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vinsertf128 $1, %xmm1, %ymm7, %ymm1
 | 
			
		||||
 218 |      | 0.000 | 1.00 |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmovhpd (%rax), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
 | 
			
		||||
 219 |      | 1.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vinsertf128 $1, %xmm4, %ymm6, %ymm4
 | 
			
		||||
 220 |      |       | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vsubpd %ymm1, %ymm5, %ymm1
 | 
			
		||||
 221 |      |       | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vsubpd %ymm4, %ymm10, %ymm4
 | 
			
		||||
 222 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm2, %ymm2, %ymm6
 | 
			
		||||
 223 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vfmadd231pd %ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
 | 
			
		||||
 224 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vfmadd231pd %ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
 | 
			
		||||
 225 | 1.00 |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vbroadcastsd .LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 | 
			
		||||
 226 |      |       |      |      | 4.50 | 4.50 |      |      |      |      |             |      |      |      |      |      || 13.0 |      |   vdivpd %ymm6, %ymm7, %ymm7
 | 
			
		||||
 227 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm7, %ymm7, %ymm11
 | 
			
		||||
 228 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm9, %ymm11, %ymm11
 | 
			
		||||
 229 | 1.00 |       |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vbroadcastsd .LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 | 
			
		||||
 230 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm7, %ymm11, %ymm11
 | 
			
		||||
 231 |      |       | 0.00 | 1.00 |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vaddpd %ymm12, %ymm11, %ymm12
 | 
			
		||||
 232 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      | 0.50 | 0.50 |      ||      |      |   vmulpd 128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
 | 
			
		||||
 233 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vmulpd %ymm7, %ymm11, %ymm7
 | 
			
		||||
 234 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  3.0 |      |   vmulpd %ymm7, %ymm12, %ymm7
 | 
			
		||||
 235 |      |       | 0.12 | 0.88 |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vcmpltpd %ymm8, %ymm6, %ymm6
 | 
			
		||||
 236 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  4.0 |      |   vfmadd213pd %ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
 | 
			
		||||
 237 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||  1.0 |      |   vblendvpd %ymm6, %ymm2, %ymm0, %ymm0
 | 
			
		||||
 238 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vfmadd213pd %ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
 | 
			
		||||
 239 | 1.00 | 0.000 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |  4.0 |   vfmadd213pd %ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
 | 
			
		||||
 240 | 0.62 | 0.380 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |      |   vblendvpd %ymm6, %ymm1, %ymm15, %ymm15
 | 
			
		||||
 241 | 0.50 | 0.500 |      |      |      |      |      |      |      |      |             |      |      |      |      |      ||      |  1.0 |   vblendvpd %ymm6, %ymm4, %ymm13, %ymm13
 | 
			
		||||
 242 |      |       |      |      |      |      |      |      | 0.25 | 0.25 | 0.25        | 0.25 |      |      |      |      ||      |      |   addq $4, %rbp
 | 
			
		||||
 243 |      |       |      |      |      |      |      |      | 0.25 | 0.25 | 0.25        | 0.25 |      |      |      |      ||      |      |   cmpq %rdx, %rbp
 | 
			
		||||
 244 |      |       |      |      |      |      |      |      | 0.00 |      |             |      | 1.00 |      |      |      ||      |      |   jb .LBB0_9
 | 
			
		||||
 | 
			
		||||
       16.1   15.63   15.6   15.6   4.50   4.50                 0.50   0.50   0.50          0.50          9.00   9.00             72    5.0  
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
Loop-Carried Dependencies Analysis Report
 | 
			
		||||
-----------------------------------------
 | 
			
		||||
 239 |  5.0 | vfmadd213pd	%ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13| [239, 241]
 | 
			
		||||
 238 |  5.0 | vfmadd213pd	%ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15| [238, 240]
 | 
			
		||||
 236 |  5.0 | vfmadd213pd	%ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0| [236, 237]
 | 
			
		||||
 242 |  1.0 | addq	$4, %rbp                       | [242]
 | 
			
		||||
 241 |  1.0 | vblendvpd	%ymm6, %ymm4, %ymm13, %ymm13| [241]
 | 
			
		||||
 240 |  1.0 | vblendvpd	%ymm6, %ymm1, %ymm15, %ymm15| [240]
 | 
			
		||||
 237 |  1.0 | vblendvpd	%ymm6, %ymm2, %ymm0, %ymm0| [237]
 | 
			
		||||
 | 
			
		||||
@@ -1,638 +0,0 @@
 | 
			
		||||
	.text
 | 
			
		||||
	.file	"force_lj.c"
 | 
			
		||||
	.section	.rodata.cst8,"aM",@progbits,8
 | 
			
		||||
	.p2align	3               # -- Begin function computeForceLJFullNeigh_plain_c
 | 
			
		||||
.LCPI0_0:
 | 
			
		||||
	.quad	4631952216750555136     #  48
 | 
			
		||||
.LCPI0_3:
 | 
			
		||||
	.quad	4607182418800017408     #  1
 | 
			
		||||
.LCPI0_4:
 | 
			
		||||
	.quad	-4620693217682128896    #  -0.5
 | 
			
		||||
	.section	.rodata.cst4,"aM",@progbits,4
 | 
			
		||||
	.p2align	2
 | 
			
		||||
.LCPI0_1:
 | 
			
		||||
	.long	3                       # 0x3
 | 
			
		||||
.LCPI0_2:
 | 
			
		||||
	.long	2                       # 0x2
 | 
			
		||||
	.section	.rodata.cst16,"aM",@progbits,16
 | 
			
		||||
	.p2align	4
 | 
			
		||||
.LCPI0_5:
 | 
			
		||||
	.zero	16,255
 | 
			
		||||
	.text
 | 
			
		||||
	.globl	computeForceLJFullNeigh_plain_c
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
	.type	computeForceLJFullNeigh_plain_c,@function
 | 
			
		||||
computeForceLJFullNeigh_plain_c:        # 
 | 
			
		||||
.LcomputeForceLJFullNeigh_plain_c$local:
 | 
			
		||||
	.cfi_startproc
 | 
			
		||||
# %bb.0:                                # 
 | 
			
		||||
	pushq	%rbp
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	pushq	%r15
 | 
			
		||||
	.cfi_def_cfa_offset 24
 | 
			
		||||
	pushq	%r14
 | 
			
		||||
	.cfi_def_cfa_offset 32
 | 
			
		||||
	pushq	%r13
 | 
			
		||||
	.cfi_def_cfa_offset 40
 | 
			
		||||
	pushq	%r12
 | 
			
		||||
	.cfi_def_cfa_offset 48
 | 
			
		||||
	pushq	%rbx
 | 
			
		||||
	.cfi_def_cfa_offset 56
 | 
			
		||||
	subq	$264, %rsp              # imm = 0x108
 | 
			
		||||
	.cfi_def_cfa_offset 320
 | 
			
		||||
	.cfi_offset %rbx, -56
 | 
			
		||||
	.cfi_offset %r12, -48
 | 
			
		||||
	.cfi_offset %r13, -40
 | 
			
		||||
	.cfi_offset %r14, -32
 | 
			
		||||
	.cfi_offset %r15, -24
 | 
			
		||||
	.cfi_offset %rbp, -16
 | 
			
		||||
	movq	%rcx, %rbx
 | 
			
		||||
	movq	%rdx, %r15
 | 
			
		||||
	movq	%rsi, %r12
 | 
			
		||||
	movl	4(%rsi), %r14d
 | 
			
		||||
	vmovsd	144(%rdi), %xmm0        # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, (%rsp)           # 8-byte Spill
 | 
			
		||||
	vmovsd	40(%rdi), %xmm0         # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, 128(%rsp)        # 8-byte Spill
 | 
			
		||||
	vmovq	56(%rdi), %xmm0         # xmm0 = mem[0],zero
 | 
			
		||||
	vmovdqa	%xmm0, 80(%rsp)         # 16-byte Spill
 | 
			
		||||
	testl	%r14d, %r14d
 | 
			
		||||
	jle	.LBB0_2
 | 
			
		||||
# %bb.1:                                # 
 | 
			
		||||
	movq	64(%r12), %rdi
 | 
			
		||||
	leaq	(,%r14,8), %rax
 | 
			
		||||
	leaq	(%rax,%rax,2), %rdx
 | 
			
		||||
	xorl	%esi, %esi
 | 
			
		||||
	callq	_intel_fast_memset
 | 
			
		||||
.LBB0_2:                                # 
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	vmovq	%xmm0, 32(%rsp)         # 8-byte Folded Spill
 | 
			
		||||
	movl	$.L.str, %edi
 | 
			
		||||
	callq	likwid_markerStartRegion
 | 
			
		||||
	testl	%r14d, %r14d
 | 
			
		||||
	jle	.LBB0_19
 | 
			
		||||
# %bb.3:                                # 
 | 
			
		||||
	vmovsd	(%rsp), %xmm0           # 8-byte Reload
 | 
			
		||||
                                        # xmm0 = mem[0],zero
 | 
			
		||||
	vmulsd	%xmm0, %xmm0, %xmm13
 | 
			
		||||
	movq	16(%r15), %r11
 | 
			
		||||
	movq	24(%r15), %rsi
 | 
			
		||||
	movslq	8(%r15), %rdi
 | 
			
		||||
	movq	16(%r12), %r15
 | 
			
		||||
	movq	64(%r12), %r8
 | 
			
		||||
	vmovsd	128(%rsp), %xmm0        # 8-byte Reload
 | 
			
		||||
                                        # xmm0 = mem[0],zero
 | 
			
		||||
	vmulsd	.LCPI0_0(%rip), %xmm0, %xmm15
 | 
			
		||||
	movq	%rbx, 24(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovdqu	(%rbx), %xmm14
 | 
			
		||||
	decq	%r14
 | 
			
		||||
	vmovq	%r15, %xmm0
 | 
			
		||||
	vpbroadcastq	%xmm0, %ymm3
 | 
			
		||||
	vbroadcastsd	%xmm13, %ymm2
 | 
			
		||||
	vmovapd	80(%rsp), %xmm12        # 16-byte Reload
 | 
			
		||||
	vbroadcastsd	%xmm12, %ymm8
 | 
			
		||||
	vbroadcastsd	%xmm15, %ymm9
 | 
			
		||||
	shlq	$2, %rdi
 | 
			
		||||
	xorl	%r10d, %r10d
 | 
			
		||||
	movq	%r14, 56(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovapd	%xmm13, 192(%rsp)       # 16-byte Spill
 | 
			
		||||
	movq	%rsi, 48(%rsp)          # 8-byte Spill
 | 
			
		||||
	movq	%rdi, 40(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovapd	%xmm15, 176(%rsp)       # 16-byte Spill
 | 
			
		||||
	vmovupd	%ymm2, 224(%rsp)        # 32-byte Spill
 | 
			
		||||
	vmovupd	%ymm9, 128(%rsp)        # 32-byte Spill
 | 
			
		||||
	jmp	.LBB0_6
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB0_17:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	movq	%r13, %rdx
 | 
			
		||||
.LBB0_5:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	vaddsd	(%r8,%r12,8), %xmm10, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%r8,%r12,8)
 | 
			
		||||
	vaddsd	(%r8,%rbx,8), %xmm11, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%r8,%rbx,8)
 | 
			
		||||
	vaddsd	(%r8,%rbp,8), %xmm5, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%r8,%rbp,8)
 | 
			
		||||
	leal	3(%r13), %eax
 | 
			
		||||
	addl	$6, %r13d
 | 
			
		||||
	testl	%eax, %eax
 | 
			
		||||
	cmovnsl	%eax, %r13d
 | 
			
		||||
	sarl	$2, %r13d
 | 
			
		||||
	movslq	%r13d, %rax
 | 
			
		||||
	vmovq	%rax, %xmm0
 | 
			
		||||
	vmovq	%rdx, %xmm1
 | 
			
		||||
	vpunpcklqdq	%xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
 | 
			
		||||
	vpaddq	%xmm0, %xmm14, %xmm14
 | 
			
		||||
	addq	%rdi, %r11
 | 
			
		||||
	cmpq	%r14, %r10
 | 
			
		||||
	leaq	1(%r10), %r10
 | 
			
		||||
	je	.LBB0_18
 | 
			
		||||
.LBB0_6:                                # 
 | 
			
		||||
                                        # =>This Loop Header: Depth=1
 | 
			
		||||
                                        #     Child Loop BB0_9 Depth 2
 | 
			
		||||
                                        #     Child Loop BB0_13 Depth 2
 | 
			
		||||
	movl	(%rsi,%r10,4), %r13d
 | 
			
		||||
	leal	(%r10,%r10,2), %r12d
 | 
			
		||||
	leal	(%r10,%r10,2), %ebx
 | 
			
		||||
	incl	%ebx
 | 
			
		||||
	leal	(%r10,%r10,2), %ebp
 | 
			
		||||
	addl	$2, %ebp
 | 
			
		||||
	testl	%r13d, %r13d
 | 
			
		||||
	jle	.LBB0_4
 | 
			
		||||
# %bb.7:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	vmovsd	(%r15,%r12,8), %xmm0    # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	(%r15,%rbx,8), %xmm1    # xmm1 = mem[0],zero
 | 
			
		||||
	vmovsd	(%r15,%rbp,8), %xmm2    # xmm2 = mem[0],zero
 | 
			
		||||
	movq	%r13, %rdx
 | 
			
		||||
	movl	$4294967292, %eax       # imm = 0xFFFFFFFC
 | 
			
		||||
	andq	%rax, %rdx
 | 
			
		||||
	vmovapd	%xmm0, 112(%rsp)        # 16-byte Spill
 | 
			
		||||
	vmovapd	%xmm1, 96(%rsp)         # 16-byte Spill
 | 
			
		||||
	vmovapd	%xmm2, (%rsp)           # 16-byte Spill
 | 
			
		||||
	je	.LBB0_16
 | 
			
		||||
# %bb.8:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	movq	%rbp, 64(%rsp)          # 8-byte Spill
 | 
			
		||||
	movq	%rbx, 72(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovdqa	%xmm14, 208(%rsp)       # 16-byte Spill
 | 
			
		||||
	vbroadcastsd	%xmm0, %ymm14
 | 
			
		||||
	vbroadcastsd	%xmm1, %ymm5
 | 
			
		||||
	vbroadcastsd	%xmm2, %ymm10
 | 
			
		||||
	vxorpd	%xmm0, %xmm0, %xmm0
 | 
			
		||||
	vxorpd	%xmm15, %xmm15, %xmm15
 | 
			
		||||
	vxorpd	%xmm13, %xmm13, %xmm13
 | 
			
		||||
	xorl	%ebp, %ebp
 | 
			
		||||
	vmovapd	%ymm8, %ymm9
 | 
			
		||||
	vmovupd	224(%rsp), %ymm8        # 32-byte Reload
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
    # OSACA-BEGIN
 | 
			
		||||
    # LLVM-MCA-BEGIN
 | 
			
		||||
.LBB0_9:                                # 
 | 
			
		||||
                                        #   Parent Loop BB0_6 Depth=1
 | 
			
		||||
                                        # =>  This Inner Loop Header: Depth=2 
 | 
			
		||||
	vpbroadcastd	.LCPI0_1(%rip), %xmm1 # xmm1 = [3,3,3,3]
 | 
			
		||||
	vpmulld	(%r11,%rbp,4), %xmm1, %xmm11
 | 
			
		||||
	vpmovsxdq	%xmm11, %ymm1
 | 
			
		||||
	vpsllq	$3, %ymm1, %ymm1
 | 
			
		||||
	vpaddq	%ymm1, %ymm3, %ymm1
 | 
			
		||||
	vmovq	%xmm1, %r14
 | 
			
		||||
	vpextrq	$1, %xmm1, %r9
 | 
			
		||||
	vextracti128	$1, %ymm1, %xmm1
 | 
			
		||||
	vmovsd	(%r14), %xmm2           # xmm2 = mem[0],zero
 | 
			
		||||
	vpsubd	.LCPI0_5, %xmm11, %xmm6
 | 
			
		||||
	vpmovsxdq	%xmm6, %ymm6
 | 
			
		||||
	vpsllq	$3, %ymm6, %ymm6
 | 
			
		||||
	vmovq	%xmm1, %rdi
 | 
			
		||||
	vpaddq	%ymm6, %ymm3, %ymm6
 | 
			
		||||
	vmovq	%xmm6, %rcx
 | 
			
		||||
	vpextrq	$1, %xmm1, %rbx
 | 
			
		||||
	vpextrq	$1, %xmm6, %rax
 | 
			
		||||
	vextracti128	$1, %ymm6, %xmm1
 | 
			
		||||
	vmovsd	(%rdi), %xmm6           # xmm6 = mem[0],zero
 | 
			
		||||
	vmovq	%xmm1, %rdi
 | 
			
		||||
	vpextrq	$1, %xmm1, %rsi
 | 
			
		||||
	vmovsd	(%rdi), %xmm1           # xmm1 = mem[0],zero
 | 
			
		||||
	vmovsd	(%rcx), %xmm7           # xmm7 = mem[0],zero
 | 
			
		||||
	vpbroadcastd	.LCPI0_2(%rip), %xmm12 # xmm12 = [2,2,2,2]
 | 
			
		||||
	vmovhpd	(%r9), %xmm2, %xmm2     # xmm2 = xmm2[0],mem[0]
 | 
			
		||||
	vpaddd	%xmm12, %xmm11, %xmm4
 | 
			
		||||
	vpmovsxdq	%xmm4, %ymm4
 | 
			
		||||
	vmovhpd	(%rax), %xmm7, %xmm7    # xmm7 = xmm7[0],mem[0]
 | 
			
		||||
	vpsllq	$3, %ymm4, %ymm4
 | 
			
		||||
	vpaddq	%ymm4, %ymm3, %ymm4
 | 
			
		||||
	vmovhpd	(%rbx), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
 | 
			
		||||
	vpextrq	$1, %xmm4, %rax
 | 
			
		||||
	vmovhpd	(%rsi), %xmm1, %xmm1    # xmm1 = xmm1[0],mem[0]
 | 
			
		||||
	vmovq	%xmm4, %rcx
 | 
			
		||||
	vextracti128	$1, %ymm4, %xmm4
 | 
			
		||||
	vmovq	%xmm4, %rsi
 | 
			
		||||
	vinsertf128	$1, %xmm6, %ymm2, %ymm2
 | 
			
		||||
	vpextrq	$1, %xmm4, %rdi
 | 
			
		||||
	vmovsd	(%rsi), %xmm4           # xmm4 = mem[0],zero
 | 
			
		||||
	vsubpd	%ymm2, %ymm14, %ymm2
 | 
			
		||||
	vmovhpd	(%rdi), %xmm4, %xmm4    # xmm4 = xmm4[0],mem[0]
 | 
			
		||||
	vmovsd	(%rcx), %xmm6           # xmm6 = mem[0],zero
 | 
			
		||||
	vinsertf128	$1, %xmm1, %ymm7, %ymm1
 | 
			
		||||
	vmovhpd	(%rax), %xmm6, %xmm6    # xmm6 = xmm6[0],mem[0]
 | 
			
		||||
	vinsertf128	$1, %xmm4, %ymm6, %ymm4
 | 
			
		||||
	vsubpd	%ymm1, %ymm5, %ymm1
 | 
			
		||||
	vsubpd	%ymm4, %ymm10, %ymm4
 | 
			
		||||
	vmulpd	%ymm2, %ymm2, %ymm6
 | 
			
		||||
	vfmadd231pd	%ymm1, %ymm1, %ymm6 # ymm6 = (ymm1 * ymm1) + ymm6
 | 
			
		||||
	vfmadd231pd	%ymm4, %ymm4, %ymm6 # ymm6 = (ymm4 * ymm4) + ymm6
 | 
			
		||||
	vbroadcastsd	.LCPI0_3(%rip), %ymm7 # ymm7 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 | 
			
		||||
	vdivpd	%ymm6, %ymm7, %ymm7
 | 
			
		||||
	vmulpd	%ymm7, %ymm7, %ymm11
 | 
			
		||||
	vmulpd	%ymm9, %ymm11, %ymm11
 | 
			
		||||
	vbroadcastsd	.LCPI0_4(%rip), %ymm12 # ymm12 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 | 
			
		||||
	vmulpd	%ymm7, %ymm11, %ymm11
 | 
			
		||||
	vaddpd	%ymm12, %ymm11, %ymm12
 | 
			
		||||
	vmulpd	128(%rsp), %ymm7, %ymm7 # 32-byte Folded Reload
 | 
			
		||||
	vmulpd	%ymm7, %ymm11, %ymm7
 | 
			
		||||
	vmulpd	%ymm7, %ymm12, %ymm7
 | 
			
		||||
	vcmpltpd	%ymm8, %ymm6, %ymm6
 | 
			
		||||
	vfmadd213pd	%ymm0, %ymm7, %ymm2 # ymm2 = (ymm7 * ymm2) + ymm0
 | 
			
		||||
	vblendvpd	%ymm6, %ymm2, %ymm0, %ymm0
 | 
			
		||||
	vfmadd213pd	%ymm15, %ymm7, %ymm1 # ymm1 = (ymm7 * ymm1) + ymm15
 | 
			
		||||
	vfmadd213pd	%ymm13, %ymm7, %ymm4 # ymm4 = (ymm7 * ymm4) + ymm13
 | 
			
		||||
	vblendvpd	%ymm6, %ymm1, %ymm15, %ymm15
 | 
			
		||||
	vblendvpd	%ymm6, %ymm4, %ymm13, %ymm13
 | 
			
		||||
	addq	$4, %rbp
 | 
			
		||||
	cmpq	%rdx, %rbp
 | 
			
		||||
	jb	.LBB0_9
 | 
			
		||||
    # LLVM-MCA-END
 | 
			
		||||
    # OSACA-END
 | 
			
		||||
# %bb.10:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	vpermilpd	$1, %xmm0, %xmm1 # xmm1 = xmm0[1,0]
 | 
			
		||||
	vaddsd	%xmm1, %xmm0, %xmm1
 | 
			
		||||
	vextractf128	$1, %ymm0, %xmm0
 | 
			
		||||
	vaddsd	%xmm0, %xmm1, %xmm1
 | 
			
		||||
	vpermilpd	$1, %xmm0, %xmm0 # xmm0 = xmm0[1,0]
 | 
			
		||||
	vaddsd	%xmm0, %xmm1, %xmm10
 | 
			
		||||
	vpermilpd	$1, %xmm15, %xmm1 # xmm1 = xmm15[1,0]
 | 
			
		||||
	vaddsd	%xmm1, %xmm15, %xmm1
 | 
			
		||||
	vextractf128	$1, %ymm15, %xmm2
 | 
			
		||||
	vaddsd	%xmm2, %xmm1, %xmm1
 | 
			
		||||
	vpermilpd	$1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
 | 
			
		||||
	vaddsd	%xmm2, %xmm1, %xmm11
 | 
			
		||||
	vpermilpd	$1, %xmm13, %xmm1 # xmm1 = xmm13[1,0]
 | 
			
		||||
	vaddsd	%xmm1, %xmm13, %xmm1
 | 
			
		||||
	vextractf128	$1, %ymm13, %xmm2
 | 
			
		||||
	vaddsd	%xmm2, %xmm1, %xmm1
 | 
			
		||||
	vpermilpd	$1, %xmm2, %xmm2 # xmm2 = xmm2[1,0]
 | 
			
		||||
	vaddsd	%xmm2, %xmm1, %xmm5
 | 
			
		||||
	movq	56(%rsp), %r14          # 8-byte Reload
 | 
			
		||||
	vmovapd	80(%rsp), %xmm12        # 16-byte Reload
 | 
			
		||||
	vmovapd	192(%rsp), %xmm13       # 16-byte Reload
 | 
			
		||||
	movq	48(%rsp), %rsi          # 8-byte Reload
 | 
			
		||||
	movq	40(%rsp), %rdi          # 8-byte Reload
 | 
			
		||||
	vmovdqa	208(%rsp), %xmm14       # 16-byte Reload
 | 
			
		||||
	vmovapd	176(%rsp), %xmm15       # 16-byte Reload
 | 
			
		||||
	vmovapd	%ymm9, %ymm8
 | 
			
		||||
	movq	72(%rsp), %rbx          # 8-byte Reload
 | 
			
		||||
	movq	64(%rsp), %rbp          # 8-byte Reload
 | 
			
		||||
	vmovapd	112(%rsp), %xmm0        # 16-byte Reload
 | 
			
		||||
	cmpq	%r13, %rdx
 | 
			
		||||
	jae	.LBB0_17
 | 
			
		||||
	jmp	.LBB0_11
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB0_4:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	movslq	%r13d, %rdx
 | 
			
		||||
	vxorpd	%xmm5, %xmm5, %xmm5
 | 
			
		||||
	vxorpd	%xmm11, %xmm11, %xmm11
 | 
			
		||||
	vxorpd	%xmm10, %xmm10, %xmm10
 | 
			
		||||
	jmp	.LBB0_5
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB0_16:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	vxorpd	%xmm10, %xmm10, %xmm10
 | 
			
		||||
	vxorpd	%xmm11, %xmm11, %xmm11
 | 
			
		||||
	vxorpd	%xmm5, %xmm5, %xmm5
 | 
			
		||||
	cmpq	%r13, %rdx
 | 
			
		||||
	jae	.LBB0_17
 | 
			
		||||
.LBB0_11:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_6 Depth=1
 | 
			
		||||
	vmovapd	96(%rsp), %xmm4         # 16-byte Reload
 | 
			
		||||
	jmp	.LBB0_13
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB0_12:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_13 Depth=2
 | 
			
		||||
	incq	%rdx
 | 
			
		||||
	cmpq	%rdx, %r13
 | 
			
		||||
	je	.LBB0_17
 | 
			
		||||
.LBB0_13:                               # 
 | 
			
		||||
                                        #   Parent Loop BB0_6 Depth=1
 | 
			
		||||
                                        # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
	movl	(%r11,%rdx,4), %eax
 | 
			
		||||
	leal	(%rax,%rax,2), %ecx
 | 
			
		||||
	movslq	%ecx, %rcx
 | 
			
		||||
	vsubsd	(%r15,%rcx,8), %xmm0, %xmm6
 | 
			
		||||
	leal	(%rax,%rax,2), %ecx
 | 
			
		||||
	incl	%ecx
 | 
			
		||||
	movslq	%ecx, %rcx
 | 
			
		||||
	vsubsd	(%r15,%rcx,8), %xmm4, %xmm2
 | 
			
		||||
	leal	2(%rax,%rax,2), %eax
 | 
			
		||||
	cltq
 | 
			
		||||
	vmovapd	(%rsp), %xmm1           # 16-byte Reload
 | 
			
		||||
	vsubsd	(%r15,%rax,8), %xmm1, %xmm1
 | 
			
		||||
	vmulsd	%xmm6, %xmm6, %xmm7
 | 
			
		||||
	vfmadd231sd	%xmm2, %xmm2, %xmm7 # xmm7 = (xmm2 * xmm2) + xmm7
 | 
			
		||||
	vfmadd231sd	%xmm1, %xmm1, %xmm7 # xmm7 = (xmm1 * xmm1) + xmm7
 | 
			
		||||
	vucomisd	%xmm13, %xmm7
 | 
			
		||||
	jae	.LBB0_12
 | 
			
		||||
# %bb.14:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB0_13 Depth=2
 | 
			
		||||
	vmovsd	.LCPI0_3(%rip), %xmm0   # xmm0 = mem[0],zero
 | 
			
		||||
	vdivsd	%xmm7, %xmm0, %xmm7
 | 
			
		||||
	vmulsd	%xmm7, %xmm7, %xmm0
 | 
			
		||||
	vmulsd	%xmm0, %xmm12, %xmm0
 | 
			
		||||
	vmulsd	%xmm7, %xmm0, %xmm0
 | 
			
		||||
	vaddsd	.LCPI0_4(%rip), %xmm0, %xmm4
 | 
			
		||||
	vmulsd	%xmm7, %xmm15, %xmm7
 | 
			
		||||
	vmulsd	%xmm0, %xmm7, %xmm0
 | 
			
		||||
	vmulsd	%xmm4, %xmm0, %xmm0
 | 
			
		||||
	vmovapd	96(%rsp), %xmm4         # 16-byte Reload
 | 
			
		||||
	vfmadd231sd	%xmm6, %xmm0, %xmm10 # xmm10 = (xmm0 * xmm6) + xmm10
 | 
			
		||||
	vfmadd231sd	%xmm2, %xmm0, %xmm11 # xmm11 = (xmm0 * xmm2) + xmm11
 | 
			
		||||
	vfmadd231sd	%xmm1, %xmm0, %xmm5 # xmm5 = (xmm0 * xmm1) + xmm5
 | 
			
		||||
	vmovapd	112(%rsp), %xmm0        # 16-byte Reload
 | 
			
		||||
	jmp	.LBB0_12
 | 
			
		||||
.LBB0_18:                               # 
 | 
			
		||||
	movq	24(%rsp), %rax          # 8-byte Reload
 | 
			
		||||
	vmovdqu	%xmm14, (%rax)
 | 
			
		||||
.LBB0_19:                               # 
 | 
			
		||||
	movl	$.L.str, %edi
 | 
			
		||||
	vzeroupper
 | 
			
		||||
	callq	likwid_markerStopRegion
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	vsubsd	32(%rsp), %xmm0, %xmm0  # 8-byte Folded Reload
 | 
			
		||||
	addq	$264, %rsp              # imm = 0x108
 | 
			
		||||
	.cfi_def_cfa_offset 56
 | 
			
		||||
	popq	%rbx
 | 
			
		||||
	.cfi_def_cfa_offset 48
 | 
			
		||||
	popq	%r12
 | 
			
		||||
	.cfi_def_cfa_offset 40
 | 
			
		||||
	popq	%r13
 | 
			
		||||
	.cfi_def_cfa_offset 32
 | 
			
		||||
	popq	%r14
 | 
			
		||||
	.cfi_def_cfa_offset 24
 | 
			
		||||
	popq	%r15
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	popq	%rbp
 | 
			
		||||
	.cfi_def_cfa_offset 8
 | 
			
		||||
	retq
 | 
			
		||||
.Lfunc_end0:
 | 
			
		||||
	.size	computeForceLJFullNeigh_plain_c, .Lfunc_end0-computeForceLJFullNeigh_plain_c
 | 
			
		||||
	.cfi_endproc
 | 
			
		||||
                                        # -- End function
 | 
			
		||||
	.section	.rodata.cst8,"aM",@progbits,8
 | 
			
		||||
	.p2align	3               # -- Begin function computeForceLJHalfNeigh
 | 
			
		||||
.LCPI1_0:
 | 
			
		||||
	.quad	4631952216750555136     #  48
 | 
			
		||||
.LCPI1_1:
 | 
			
		||||
	.quad	4607182418800017408     #  1
 | 
			
		||||
.LCPI1_2:
 | 
			
		||||
	.quad	-4620693217682128896    #  -0.5
 | 
			
		||||
	.text
 | 
			
		||||
	.globl	computeForceLJHalfNeigh
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
	.type	computeForceLJHalfNeigh,@function
 | 
			
		||||
computeForceLJHalfNeigh:                # 
 | 
			
		||||
.LcomputeForceLJHalfNeigh$local:
 | 
			
		||||
	.cfi_startproc
 | 
			
		||||
# %bb.0:                                # 
 | 
			
		||||
	pushq	%rbp
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	pushq	%r15
 | 
			
		||||
	.cfi_def_cfa_offset 24
 | 
			
		||||
	pushq	%r14
 | 
			
		||||
	.cfi_def_cfa_offset 32
 | 
			
		||||
	pushq	%r13
 | 
			
		||||
	.cfi_def_cfa_offset 40
 | 
			
		||||
	pushq	%r12
 | 
			
		||||
	.cfi_def_cfa_offset 48
 | 
			
		||||
	pushq	%rbx
 | 
			
		||||
	.cfi_def_cfa_offset 56
 | 
			
		||||
	subq	$40, %rsp
 | 
			
		||||
	.cfi_def_cfa_offset 96
 | 
			
		||||
	.cfi_offset %rbx, -56
 | 
			
		||||
	.cfi_offset %r12, -48
 | 
			
		||||
	.cfi_offset %r13, -40
 | 
			
		||||
	.cfi_offset %r14, -32
 | 
			
		||||
	.cfi_offset %r15, -24
 | 
			
		||||
	.cfi_offset %rbp, -16
 | 
			
		||||
	movq	%rcx, 16(%rsp)          # 8-byte Spill
 | 
			
		||||
	movq	%rdx, %r15
 | 
			
		||||
	movq	%rsi, %r12
 | 
			
		||||
	movl	4(%rsi), %r13d
 | 
			
		||||
	vmovsd	144(%rdi), %xmm0        # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, 8(%rsp)          # 8-byte Spill
 | 
			
		||||
	vmovsd	40(%rdi), %xmm0         # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, (%rsp)           # 8-byte Spill
 | 
			
		||||
	vmovsd	56(%rdi), %xmm0         # xmm0 = mem[0],zero
 | 
			
		||||
	vmovsd	%xmm0, 32(%rsp)         # 8-byte Spill
 | 
			
		||||
	testl	%r13d, %r13d
 | 
			
		||||
	jle	.LBB1_2
 | 
			
		||||
# %bb.1:                                # 
 | 
			
		||||
	movq	64(%r12), %rdi
 | 
			
		||||
	leaq	(,%r13,8), %rax
 | 
			
		||||
	leaq	(%rax,%rax,2), %rdx
 | 
			
		||||
	xorl	%esi, %esi
 | 
			
		||||
	callq	_intel_fast_memset
 | 
			
		||||
.LBB1_2:                                # 
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	vmovsd	%xmm0, 24(%rsp)         # 8-byte Spill
 | 
			
		||||
	movl	$.L.str.1, %edi
 | 
			
		||||
	callq	likwid_markerStartRegion
 | 
			
		||||
	testl	%r13d, %r13d
 | 
			
		||||
	jle	.LBB1_8
 | 
			
		||||
# %bb.3:                                # 
 | 
			
		||||
	vmovsd	8(%rsp), %xmm0          # 8-byte Reload
 | 
			
		||||
                                        # xmm0 = mem[0],zero
 | 
			
		||||
	vmulsd	%xmm0, %xmm0, %xmm12
 | 
			
		||||
	movq	16(%r15), %rax
 | 
			
		||||
	movq	24(%r15), %rcx
 | 
			
		||||
	movq	%rcx, 8(%rsp)           # 8-byte Spill
 | 
			
		||||
	movslq	8(%r15), %rdx
 | 
			
		||||
	movq	16(%r12), %rsi
 | 
			
		||||
	movq	64(%r12), %rdi
 | 
			
		||||
	vmovsd	(%rsp), %xmm0           # 8-byte Reload
 | 
			
		||||
                                        # xmm0 = mem[0],zero
 | 
			
		||||
	vmulsd	.LCPI1_0(%rip), %xmm0, %xmm11
 | 
			
		||||
	movq	16(%rsp), %rcx          # 8-byte Reload
 | 
			
		||||
	vmovdqu	(%rcx), %xmm10
 | 
			
		||||
	shlq	$2, %rdx
 | 
			
		||||
	movq	%rdx, (%rsp)            # 8-byte Spill
 | 
			
		||||
	xorl	%r12d, %r12d
 | 
			
		||||
	jmp	.LBB1_4
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB1_5:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_4 Depth=1
 | 
			
		||||
	vxorpd	%xmm13, %xmm13, %xmm13
 | 
			
		||||
	movq	%r9, %rdx
 | 
			
		||||
	vxorpd	%xmm9, %xmm9, %xmm9
 | 
			
		||||
	vxorpd	%xmm14, %xmm14, %xmm14
 | 
			
		||||
.LBB1_6:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_4 Depth=1
 | 
			
		||||
	vaddsd	(%rdi,%r15,8), %xmm14, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%r15,8)
 | 
			
		||||
	vaddsd	(%rdi,%r10,8), %xmm9, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%r10,8)
 | 
			
		||||
	vaddsd	(%rdi,%r11,8), %xmm13, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%r11,8)
 | 
			
		||||
	leal	3(%r9), %ecx
 | 
			
		||||
	addl	$6, %r9d
 | 
			
		||||
	testl	%ecx, %ecx
 | 
			
		||||
	cmovnsl	%ecx, %r9d
 | 
			
		||||
	sarl	$2, %r9d
 | 
			
		||||
	movslq	%r9d, %rcx
 | 
			
		||||
	vmovq	%rcx, %xmm0
 | 
			
		||||
	vmovq	%rdx, %xmm1
 | 
			
		||||
	vpunpcklqdq	%xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
 | 
			
		||||
	vpaddq	%xmm0, %xmm10, %xmm10
 | 
			
		||||
	incq	%r12
 | 
			
		||||
	addq	(%rsp), %rax            # 8-byte Folded Reload
 | 
			
		||||
	cmpq	%r13, %r12
 | 
			
		||||
	je	.LBB1_7
 | 
			
		||||
.LBB1_4:                                # 
 | 
			
		||||
                                        # =>This Loop Header: Depth=1
 | 
			
		||||
                                        #     Child Loop BB1_10 Depth 2
 | 
			
		||||
	movq	8(%rsp), %rcx           # 8-byte Reload
 | 
			
		||||
	movslq	(%rcx,%r12,4), %r9
 | 
			
		||||
	leaq	(%r12,%r12,2), %rcx
 | 
			
		||||
	leal	1(%rcx), %r10d
 | 
			
		||||
	leal	2(%rcx), %r11d
 | 
			
		||||
	movl	%ecx, %r15d
 | 
			
		||||
	testq	%r9, %r9
 | 
			
		||||
	jle	.LBB1_5
 | 
			
		||||
# %bb.9:                                # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_4 Depth=1
 | 
			
		||||
	vmovsd	(%rsi,%r15,8), %xmm15   # xmm15 = mem[0],zero
 | 
			
		||||
	vmovsd	(%rsi,%r10,8), %xmm4    # xmm4 = mem[0],zero
 | 
			
		||||
	vmovsd	(%rsi,%r11,8), %xmm1    # xmm1 = mem[0],zero
 | 
			
		||||
	movl	%r9d, %edx
 | 
			
		||||
	vxorpd	%xmm14, %xmm14, %xmm14
 | 
			
		||||
	xorl	%ecx, %ecx
 | 
			
		||||
	vxorpd	%xmm9, %xmm9, %xmm9
 | 
			
		||||
	vxorpd	%xmm13, %xmm13, %xmm13
 | 
			
		||||
	jmp	.LBB1_10
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
.LBB1_13:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_10 Depth=2
 | 
			
		||||
	incq	%rcx
 | 
			
		||||
	cmpq	%rcx, %rdx
 | 
			
		||||
	je	.LBB1_6
 | 
			
		||||
.LBB1_10:                               # 
 | 
			
		||||
                                        #   Parent Loop BB1_4 Depth=1
 | 
			
		||||
                                        # =>  This Inner Loop Header: Depth=2
 | 
			
		||||
	movslq	(%rax,%rcx,4), %r8
 | 
			
		||||
	leaq	(%r8,%r8,2), %r14
 | 
			
		||||
	vsubsd	(%rsi,%r14,8), %xmm15, %xmm2
 | 
			
		||||
	movslq	%r14d, %rbp
 | 
			
		||||
	vsubsd	8(%rsi,%rbp,8), %xmm4, %xmm5
 | 
			
		||||
	vsubsd	16(%rsi,%rbp,8), %xmm1, %xmm0
 | 
			
		||||
	vmulsd	%xmm2, %xmm2, %xmm6
 | 
			
		||||
	vfmadd231sd	%xmm5, %xmm5, %xmm6 # xmm6 = (xmm5 * xmm5) + xmm6
 | 
			
		||||
	vfmadd231sd	%xmm0, %xmm0, %xmm6 # xmm6 = (xmm0 * xmm0) + xmm6
 | 
			
		||||
	vucomisd	%xmm12, %xmm6
 | 
			
		||||
	jae	.LBB1_13
 | 
			
		||||
# %bb.11:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_10 Depth=2
 | 
			
		||||
	vmovsd	.LCPI1_1(%rip), %xmm3   # xmm3 = mem[0],zero
 | 
			
		||||
	vdivsd	%xmm6, %xmm3, %xmm6
 | 
			
		||||
	vmulsd	32(%rsp), %xmm6, %xmm3  # 8-byte Folded Reload
 | 
			
		||||
	vmulsd	%xmm6, %xmm6, %xmm8
 | 
			
		||||
	vmulsd	%xmm3, %xmm8, %xmm3
 | 
			
		||||
	vaddsd	.LCPI1_2(%rip), %xmm3, %xmm7
 | 
			
		||||
	vmulsd	%xmm6, %xmm11, %xmm6
 | 
			
		||||
	vmulsd	%xmm3, %xmm6, %xmm3
 | 
			
		||||
	vmulsd	%xmm7, %xmm3, %xmm3
 | 
			
		||||
	vmulsd	%xmm2, %xmm3, %xmm6
 | 
			
		||||
	vaddsd	%xmm6, %xmm14, %xmm14
 | 
			
		||||
	vmulsd	%xmm5, %xmm3, %xmm2
 | 
			
		||||
	vaddsd	%xmm2, %xmm9, %xmm9
 | 
			
		||||
	vmulsd	%xmm0, %xmm3, %xmm0
 | 
			
		||||
	vaddsd	%xmm0, %xmm13, %xmm13
 | 
			
		||||
	cmpl	%r13d, %r8d
 | 
			
		||||
	jge	.LBB1_13
 | 
			
		||||
# %bb.12:                               # 
 | 
			
		||||
                                        #   in Loop: Header=BB1_10 Depth=2
 | 
			
		||||
	leaq	1(%rbp), %rbx
 | 
			
		||||
	addq	$2, %rbp
 | 
			
		||||
	vmovsd	(%rdi,%r14,8), %xmm3    # xmm3 = mem[0],zero
 | 
			
		||||
	vsubsd	%xmm6, %xmm3, %xmm3
 | 
			
		||||
	vmovsd	%xmm3, (%rdi,%r14,8)
 | 
			
		||||
	vmovsd	(%rdi,%rbx,8), %xmm3    # xmm3 = mem[0],zero
 | 
			
		||||
	vsubsd	%xmm2, %xmm3, %xmm2
 | 
			
		||||
	vmovsd	%xmm2, (%rdi,%rbx,8)
 | 
			
		||||
	vmovsd	(%rdi,%rbp,8), %xmm2    # xmm2 = mem[0],zero
 | 
			
		||||
	vsubsd	%xmm0, %xmm2, %xmm0
 | 
			
		||||
	vmovsd	%xmm0, (%rdi,%rbp,8)
 | 
			
		||||
	jmp	.LBB1_13
 | 
			
		||||
.LBB1_7:                                # 
 | 
			
		||||
	movq	16(%rsp), %rax          # 8-byte Reload
 | 
			
		||||
	vmovdqu	%xmm10, (%rax)
 | 
			
		||||
.LBB1_8:                                # 
 | 
			
		||||
	movl	$.L.str.1, %edi
 | 
			
		||||
	callq	likwid_markerStopRegion
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	vsubsd	24(%rsp), %xmm0, %xmm0  # 8-byte Folded Reload
 | 
			
		||||
	addq	$40, %rsp
 | 
			
		||||
	.cfi_def_cfa_offset 56
 | 
			
		||||
	popq	%rbx
 | 
			
		||||
	.cfi_def_cfa_offset 48
 | 
			
		||||
	popq	%r12
 | 
			
		||||
	.cfi_def_cfa_offset 40
 | 
			
		||||
	popq	%r13
 | 
			
		||||
	.cfi_def_cfa_offset 32
 | 
			
		||||
	popq	%r14
 | 
			
		||||
	.cfi_def_cfa_offset 24
 | 
			
		||||
	popq	%r15
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	popq	%rbp
 | 
			
		||||
	.cfi_def_cfa_offset 8
 | 
			
		||||
	retq
 | 
			
		||||
.Lfunc_end1:
 | 
			
		||||
	.size	computeForceLJHalfNeigh, .Lfunc_end1-computeForceLJHalfNeigh
 | 
			
		||||
	.cfi_endproc
 | 
			
		||||
                                        # -- End function
 | 
			
		||||
	.globl	computeForceLJFullNeigh_simd # -- Begin function computeForceLJFullNeigh_simd
 | 
			
		||||
	.p2align	4, 0x90
 | 
			
		||||
	.type	computeForceLJFullNeigh_simd,@function
 | 
			
		||||
computeForceLJFullNeigh_simd:           # 
 | 
			
		||||
.LcomputeForceLJFullNeigh_simd$local:
 | 
			
		||||
	.cfi_startproc
 | 
			
		||||
# %bb.0:                                # 
 | 
			
		||||
	pushq	%rax
 | 
			
		||||
	.cfi_def_cfa_offset 16
 | 
			
		||||
	movl	4(%rsi), %eax
 | 
			
		||||
	testl	%eax, %eax
 | 
			
		||||
	jle	.LBB2_2
 | 
			
		||||
# %bb.1:                                # 
 | 
			
		||||
	movq	64(%rsi), %rdi
 | 
			
		||||
	shlq	$3, %rax
 | 
			
		||||
	leaq	(%rax,%rax,2), %rdx
 | 
			
		||||
	xorl	%esi, %esi
 | 
			
		||||
	callq	_intel_fast_memset
 | 
			
		||||
.LBB2_2:                                # 
 | 
			
		||||
	xorl	%eax, %eax
 | 
			
		||||
	callq	getTimeStamp
 | 
			
		||||
	movl	$.L.str, %edi
 | 
			
		||||
	callq	likwid_markerStartRegion
 | 
			
		||||
	movq	stderr(%rip), %rcx
 | 
			
		||||
	movl	$.L.str.2, %edi
 | 
			
		||||
	movl	$65, %esi
 | 
			
		||||
	movl	$1, %edx
 | 
			
		||||
	callq	fwrite
 | 
			
		||||
	movl	$-1, %edi
 | 
			
		||||
	callq	exit
 | 
			
		||||
.Lfunc_end2:
 | 
			
		||||
	.size	computeForceLJFullNeigh_simd, .Lfunc_end2-computeForceLJFullNeigh_simd
 | 
			
		||||
	.cfi_endproc
 | 
			
		||||
                                        # -- End function
 | 
			
		||||
	.type	.L.str,@object          # 
 | 
			
		||||
	.section	.rodata.str1.1,"aMS",@progbits,1
 | 
			
		||||
.L.str:
 | 
			
		||||
	.asciz	"force"
 | 
			
		||||
	.size	.L.str, 6
 | 
			
		||||
 | 
			
		||||
	.type	.L.str.1,@object        # 
 | 
			
		||||
.L.str.1:
 | 
			
		||||
	.asciz	"forceLJ-halfneigh"
 | 
			
		||||
	.size	.L.str.1, 18
 | 
			
		||||
 | 
			
		||||
	.type	.L.str.2,@object        # 
 | 
			
		||||
.L.str.2:
 | 
			
		||||
	.asciz	"Error: SIMD kernel not implemented for specified instruction set!"
 | 
			
		||||
	.size	.L.str.2, 66
 | 
			
		||||
 | 
			
		||||
	.ident	"Intel(R) oneAPI DPC++ Compiler 2021.1-beta05 (2020.2.0.0304)"
 | 
			
		||||
	.section	".note.GNU-stack","",@progbits
 | 
			
		||||
@@ -1,112 +1,46 @@
 | 
			
		||||
#!/bin/bash
 | 
			
		||||
 | 
			
		||||
[[ -z "$1" ]] && echo "Use: $0 <binary> [-c <core>] [-f <freq>] [-n <nruns>] [-l <log>] [-s]" && exit
 | 
			
		||||
[[ ! -f "$1" ]] && echo "Binary file not found, make sure to use 'make'" && exit
 | 
			
		||||
[[ ! -f "$1-stub" ]] && echo "Binary file for stubbed case not found, make sure to use 'make VARIANT=stub'" && exit
 | 
			
		||||
TAG=ICX
 | 
			
		||||
OPT_SCHEME=gromacs
 | 
			
		||||
MDBENCH_BIN=./MDBench-$TAG-$OPT_SCHEME
 | 
			
		||||
FREQ=2.4
 | 
			
		||||
NRUNS=3
 | 
			
		||||
FIXED_PARAMS=--freq $FREQ
 | 
			
		||||
 | 
			
		||||
MDBENCH_BIN=$1
 | 
			
		||||
BIN_INFO="${MDBENCH_BIN#*-}" # $OPT_SCHEME-$TAG-$ISA-$PREC
 | 
			
		||||
OPT_SCHEME="${BIN_INFO%%-*}"
 | 
			
		||||
PREC="${BIN_INFO##*-}"
 | 
			
		||||
BIN_INFO="${BIN_INFO#*-}" # $TAG-$ISA-$PREC
 | 
			
		||||
BIN_INFO="${BIN_INFO%-*}" # $TAG-$ISA
 | 
			
		||||
TAG="${BIN_INFO%%-*}"
 | 
			
		||||
ISA="${BIN_INFO##*-}"
 | 
			
		||||
CORE="${CORE:-0}"
 | 
			
		||||
FREQ="${FREQ:-2.4}"
 | 
			
		||||
NRUNS="${NRUNS:-3}"
 | 
			
		||||
LOG="${LOG:-latencies_and_cfds.log}"
 | 
			
		||||
STUB_ONLY="${STUB_ONLY:-false}"
 | 
			
		||||
SKIP_SET_FREQ="${SKIP_SET_FREQ:-false}"
 | 
			
		||||
 | 
			
		||||
OPTIND=2
 | 
			
		||||
while getopts "c:f:n:l:s" flag; do
 | 
			
		||||
    case "${flag}" in
 | 
			
		||||
        c) CORE=${OPTARG};;
 | 
			
		||||
        f) FREQ=${OPTARG};;
 | 
			
		||||
        n) NRUNS=${OPTARG};;
 | 
			
		||||
        l) LOG=${OPTARG};;
 | 
			
		||||
        s) STUB_ONLY=true;;
 | 
			
		||||
    esac
 | 
			
		||||
done
 | 
			
		||||
 | 
			
		||||
# Other useful variables
 | 
			
		||||
MDBENCH_BIN=./MDBench-$OPT_SCHEME-$TAG-$ISA-$PREC
 | 
			
		||||
FIXED_PARAMS="--freq $FREQ"
 | 
			
		||||
CPU_VENDOR=$(lscpu | grep "Vendor ID" | tr -s ' ' | cut -d ' ' -f3)
 | 
			
		||||
 | 
			
		||||
if [ "$CPU_VENDOR" == "GenuineIntel" ]; then
 | 
			
		||||
    ALL_PREFETCHERS="HW_PREFETCHER,CL_PREFETCHER,DCU_PREFETCHER,IP_PREFETCHER"
 | 
			
		||||
    PREFETCHERS=("ALL HW_PREFETCHER CL_PREFETCHER DCU_PREFETCHER IP_PREFETCHER NONE")
 | 
			
		||||
if [ "$OPT_SCHEME" = "gromacs" ]; then
 | 
			
		||||
    STUB1_NAME=Stub-33
 | 
			
		||||
    STUB1_PARAMS=-na 4 -nn 33
 | 
			
		||||
    STUB2_NAME=Stub-128
 | 
			
		||||
    STUB2_PARAMS=-na 4 -nn 128
 | 
			
		||||
else
 | 
			
		||||
    ALL_PREFETCHERS=""
 | 
			
		||||
    PREFETCHERS=("IGNORE")
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
if [ "$OPT_SCHEME" == "gromacs" ]; then
 | 
			
		||||
    STUB1_NAME=stub-33
 | 
			
		||||
    STUB1_PARAMS="-na 4 -nn 33"
 | 
			
		||||
    STUB2_NAME=stub-128
 | 
			
		||||
    STUB2_PARAMS="-na 4 -nn 128"
 | 
			
		||||
else
 | 
			
		||||
    STUB1_NAME=stub-76
 | 
			
		||||
    STUB1_PARAMS="-nn 76"
 | 
			
		||||
    STUB2_NAME=stub-1024
 | 
			
		||||
    STUB2_PARAMS="-nn 1024"
 | 
			
		||||
    STUB1_NAME=Stub-76
 | 
			
		||||
    STUB1_PARAMS=-nn 76
 | 
			
		||||
    STUB2_NAME=Stub-1024
 | 
			
		||||
    STUB2_PARAMS=-nn 1024
 | 
			
		||||
fi
 | 
			
		||||
 | 
			
		||||
function run_benchmark() {
 | 
			
		||||
    BEST=10000000
 | 
			
		||||
    for i in $(seq $NRUNS); do
 | 
			
		||||
        RES=$(likwid-pin -c $CORE "$* $FIXED_PARAMS" 2>&1 | grep "Cycles/SIMD iteration" | cut -d ' ' -f3)
 | 
			
		||||
        if (( $(echo "$BEST > $RES" | bc -l ) )); then
 | 
			
		||||
            BEST=$RES
 | 
			
		||||
        fi
 | 
			
		||||
        likwid-pin -c 0 "$* $FIXED_PARAMS" 2>&1 | grep "Cycles/SIMD iteration" | cut -d ' ' -f3
 | 
			
		||||
    done
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
echo "Tag: $TAG" | tee -a $LOG
 | 
			
		||||
echo "Optimization scheme: $OPT_SCHEME" | tee -a $LOG
 | 
			
		||||
echo "Instruction set: $ISA" | tee -a $LOG
 | 
			
		||||
echo "Precision: $PREC" | tee -a $LOG
 | 
			
		||||
echo "Binary: $MDBENCH_BIN(-stub)" | tee -a $LOG
 | 
			
		||||
echo "Frequency: $FREQ" | tee -a $LOG
 | 
			
		||||
echo "Number of runs: $NRUNS" | tee -a $LOG
 | 
			
		||||
echo "Run only stubbed cases: $STUB_ONLY" | tee -a $LOG
 | 
			
		||||
echo "Tag: $TAG"
 | 
			
		||||
echo "Optimization scheme: $OPT_SCHEME"
 | 
			
		||||
echo "Binary: $MDBENCH_BIN(-stub)"
 | 
			
		||||
echo "Frequency: $FREQ"
 | 
			
		||||
echo "Number of runs: $NRUNS"
 | 
			
		||||
 | 
			
		||||
if [ "$SKIP_SET_FREQ" == "false" ]; then
 | 
			
		||||
    echo "Fixing frequencies..."
 | 
			
		||||
    likwid-setFrequencies -f $FREQ -t 0
 | 
			
		||||
fi
 | 
			
		||||
echo "Fixing frequencies..."
 | 
			
		||||
likwid-setFrequencies -f $FREQ -t 0
 | 
			
		||||
 | 
			
		||||
for p in $PREFETCHERS; do
 | 
			
		||||
    if [ "$p" != "IGNORE" ]; then
 | 
			
		||||
        if [ "$p" == "ALL" ]; then
 | 
			
		||||
            likwid-features -c $CORE -e $ALL_PREFETCHERS
 | 
			
		||||
        elif [ "$p" == "NONE" ]; then
 | 
			
		||||
            likwid-features -c $CORE -d $ALL_PREFETCHERS
 | 
			
		||||
        else
 | 
			
		||||
            likwid-features -c $CORE -d $ALL_PREFETCHERS
 | 
			
		||||
            likwid-features -c $CORE -e $p
 | 
			
		||||
        fi
 | 
			
		||||
 | 
			
		||||
        echo "Prefetcher settings: $p"
 | 
			
		||||
        likwid-features -c $CORE -l
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    MSG="$p: "
 | 
			
		||||
    if [ "$STUB_ONLY" == "false" ]; then
 | 
			
		||||
        run_benchmark $MDBENCH_BIN
 | 
			
		||||
        MSG+="standard=$BEST, "
 | 
			
		||||
        run_benchmark $MDBENCH_BIN -i data/copper_melting/input_lj_cu_one_atomtype_20x20x20.dmp
 | 
			
		||||
        MSG+="melt=$BEST, "
 | 
			
		||||
        run_benchmark $MDBENCH_BIN -p data/argon_1000/mdbench_params.conf -i data/argon_1000/tprout.gro
 | 
			
		||||
        MSG+="argon=$BEST, "
 | 
			
		||||
    fi
 | 
			
		||||
 | 
			
		||||
    run_benchmark $MDBENCH_BIN-stub $STUB1_PARAMS
 | 
			
		||||
    MSG+="$STUB1_NAME=$BEST, "
 | 
			
		||||
    run_benchmark $MDBENCH_BIN-stub $STUB2_PARAMS
 | 
			
		||||
    MSG+="$STUB2_NAME=$BEST"
 | 
			
		||||
    echo $MSG | tee -a $LOG
 | 
			
		||||
done
 | 
			
		||||
echo "Standard"
 | 
			
		||||
run_benchmark $MDBENCH_BIN
 | 
			
		||||
echo "Melt"
 | 
			
		||||
run_benchmark $MDBENCH_BIN -i data/copper_melting/input_lj_cu_one_atomtype_20x20x20.dmp
 | 
			
		||||
echo "Argon"
 | 
			
		||||
run_benchmark $MDBENCH_BIN -p data/argon_1000/mdbench_params.conf -i data/argon_1000/tprout.gro
 | 
			
		||||
echo "$STUB1_NAME"
 | 
			
		||||
run_benchmark $MDBENCH_BIN-stub $STUB1_PARAMS
 | 
			
		||||
echo "$STUB2_NAME"
 | 
			
		||||
run_benchmark $MDBENCH_BIN-stub $STUB2_PARAMS
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user