Add first compilable version of Gromacs with SP
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
parent 8669f2f6d7
commit d61576699d
Makefile: 15 lines changed
@@ -17,9 +17,6 @@ INCLUDES += -I./$(SRC_DIR)/includes
 ifeq ($(strip $(DATA_LAYOUT)),AOS)
 DEFINES += -DAOS
 endif
-ifeq ($(strip $(CLUSTER_LAYOUT)),AOS)
-DEFINES += -DCLUSTER_AOS
-endif
 ifeq ($(strip $(DATA_TYPE)),SP)
 DEFINES += -DPRECISION=1
 else
@@ -30,14 +27,6 @@ ifneq ($(ASM_SYNTAX), ATT)
 ASFLAGS += -masm=intel
 endif
 
-ifneq ($(ATOMS_LOOP_RUNS),)
-DEFINES += -DATOMS_LOOP_RUNS=$(ATOMS_LOOP_RUNS)
-endif
-
-ifneq ($(NEIGHBORS_LOOP_RUNS),)
-DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS)
-endif
-
 ifeq ($(strip $(EXPLICIT_TYPES)),true)
 DEFINES += -DEXPLICIT_TYPES
 endif
@@ -74,6 +63,10 @@ ifeq ($(strip $(NO_AVX2)),true)
 DEFINES += -DNO_AVX2
 endif
 
+ifeq ($(strip $(AVX512)),true)
+DEFINES += -DAVX512
+endif
+
 VPATH = $(SRC_DIR) $(ASM_DIR)
 ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
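The new AVX512 switch mirrors the existing NO_AVX2 handling: the ISA include further down sets AVX512=true for ISA=AVX512, and the block above turns that into -DAVX512 for the preprocessor. A plausible invocation exercising the new single-precision AVX-512 path (variable names are the ones defined in this commit; overriding them on the command line is assumed to work as usual for ?= defaults):

    make OPT_SCHEME=gromacs ISA=AVX512 DATA_TYPE=SP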
@@ -7,7 +7,7 @@ OPT_SCHEME ?= gromacs
 # Enable likwid (true or false)
 ENABLE_LIKWID ?= true
 # SP or DP
-DATA_TYPE ?= DP
+DATA_TYPE ?= SP
 # AOS or SOA
 DATA_LAYOUT ?= AOS
 # Assembly syntax to generate (ATT/INTEL)
@@ -15,10 +15,6 @@ ASM_SYNTAX ?= ATT
 # Debug
 DEBUG ?= false
 
-# Number of times to run the atoms loop on stubbed variant
-ATOMS_LOOP_RUNS ?= 1
-# Number of times to run the neighbors loop on stubbed variant
-NEIGHBORS_LOOP_RUNS ?= 1
 # Explicitly store and load atom types (true or false)
 EXPLICIT_TYPES ?= false
 # Trace memory addresses for cache simulator (true or false)
@@ -29,8 +25,6 @@ INDEX_TRACER ?= false
 COMPUTE_STATS ?= true
 
 # Configurations for gromacs optimization scheme
-# AOS or SOA
-CLUSTER_LAYOUT ?= SOA
 # Use reference version
 USE_REFERENCE_VERSION ?= false
 # Enable XTC output
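For orientation, a sketch of what the new SP default implies end to end, pieced together from the Makefile hunk above and the SIMD header changes below (a summary, not code from the commit):

    DATA_TYPE=SP  ->  -DPRECISION=1  ->  on AVX-512: MD_SIMD_FLOAT = __m512  (16 lanes), MD_SIMD_MASK = __mmask16
    DATA_TYPE=DP  ->  PRECISION == 2 ->  on AVX-512: MD_SIMD_FLOAT = __m512d (8 lanes),  MD_SIMD_MASK = __mmask8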
@@ -128,7 +128,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 }
 
 double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
-DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
+DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
 int Nlocal = atom->Nlocal;
 int* neighs;
 MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
@@ -136,10 +136,10 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
 MD_FLOAT epsilon = param->epsilon;
 MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
 MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
-MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon);
+MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
 MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
 MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
-const int unroll_j = VECTOR_WIDTH / CLUSTER_N;
+const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
 
 double S = getTimeStamp();
 LIKWID_MARKER_START("force");
@@ -152,131 +152,86 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
 neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
 int numneighs = neighbor->numneigh[ci];
 
-MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
-MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
-MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]);
-MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]);
-MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]);
-MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]);
-MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]);
-MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]);
-MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]);
-MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]);
-MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]);
-MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]);
+MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
+MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
+MD_SIMD_FLOAT yi0_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 0]);
+MD_SIMD_FLOAT yi2_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 2]);
+MD_SIMD_FLOAT zi0_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 0]);
+MD_SIMD_FLOAT zi2_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 2]);
 MD_SIMD_FLOAT fix0 = simd_zero();
 MD_SIMD_FLOAT fiy0 = simd_zero();
 MD_SIMD_FLOAT fiz0 = simd_zero();
-MD_SIMD_FLOAT fix1 = simd_zero();
-MD_SIMD_FLOAT fiy1 = simd_zero();
-MD_SIMD_FLOAT fiz1 = simd_zero();
 MD_SIMD_FLOAT fix2 = simd_zero();
 MD_SIMD_FLOAT fiy2 = simd_zero();
 MD_SIMD_FLOAT fiz2 = simd_zero();
-MD_SIMD_FLOAT fix3 = simd_zero();
-MD_SIMD_FLOAT fiy3 = simd_zero();
-MD_SIMD_FLOAT fiz3 = simd_zero();
 
-for(int k = 0; k < numneighs; k += unroll_j) {
-int cj = neighs[k + 0];
+for(int k = 0; k < numneighs; k += 2) {
+int cj = neighs[k];
 int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
 MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
-unsigned int cond0 = (unsigned int)(ci == cj);
-unsigned int cond1 = cond0;
-MD_SIMD_FLOAT xj_tmp, yj_tmp, zj_tmp;
+unsigned int mask0, mask1, mask2, mask3;
 
-/*
-MD_FLOAT *cj0_ptr = cluster_pos_ptr(cj0);
-#if VECTOR_WIDTH == 8
-int cj1 = neighs[k + 1];
-unsigned int cond1 = (unsigned int)(ci == cj1);
-MD_FLOAT *cj1_ptr = cluster_pos_ptr(cj1);
-MD_SIMD_FLOAT xj_tmp = simd_load2(cj0_ptr, cj1_ptr, 0);
-MD_SIMD_FLOAT yj_tmp = simd_load2(cj0_ptr, cj1_ptr, 1);
-MD_SIMD_FLOAT zj_tmp = simd_load2(cj0_ptr, cj1_ptr, 2);
-#else
-MD_SIMD_FLOAT xj_tmp = simd_load(cj0_ptr, 0);
-MD_SIMD_FLOAT yj_tmp = simd_load(cj0_ptr, 1);
-MD_SIMD_FLOAT zj_tmp = simd_load(cj0_ptr, 2);
-#endif
-*/
-
+MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
+MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
+MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
 MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
 MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
 MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
-MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
-MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
-MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
 MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
 MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
 MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
-MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
-MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
-MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
 
-#if VECTOR_WIDTH == 8
-MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
-MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
-MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
-MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
+#if CLUSTER_M == CLUSTER_N
+unsigned int cond0 = (unsigned int)(cj == ci_cj0);
+mask0 = (unsigned int)(0xf - 0x1 * cond0);
+mask1 = (unsigned int)(0xf - 0x2 * cond0);
+mask2 = (unsigned int)(0xf - 0x4 * cond0);
+mask3 = (unsigned int)(0xf - 0x8 * cond0);
+#elif CLUSTER_M < CLUSTER_N
+unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
+unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
+mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
+mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
+mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
+mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
 #else
-MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
-MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
-MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
-MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
+unsigned int cond0 = (unsigned int)(cj == ci_cj0);
+unsigned int cond1 = (unsigned int)(cj == ci_cj1);
+mask0 = (unsigned int)(0x3 - 0x1 * cond0);
+mask1 = (unsigned int)(0x3 - 0x2 * cond0);
+mask2 = (unsigned int)(0x3 - 0x1 * cond1);
+mask3 = (unsigned int)(0x3 - 0x2 * cond1);
 #endif
 
+MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
+MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
+
 MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
-MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
 MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
-MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
 
 MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
-MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
 MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
-MD_SIMD_MASK cutoff_mask3 = simd_mask_and(excl_mask3, simd_mask_cond_lt(rsq3, cutforcesq_vec));
 
 MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
-MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
 MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
-MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
 
 MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
-MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
 MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
-MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
 
-MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec))));
-MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec))));
-MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec))));
-MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec))));
+MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
+MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
 
 fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
 fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
 fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
-fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1);
-fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1);
-fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1);
 fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
 fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
 fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
-fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3);
-fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3);
-fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
 }
 
-ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0);
-ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1);
-ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2);
-ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3);
-ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0);
-ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1);
-ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2);
-ci_f[CL_Y_OFFSET + 3] = simd_horizontal_sum(fiy3);
-ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0);
-ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1);
-ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2);
-ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3);
+simd_h_dual_reduce_sum(&ci_f[CL_X_OFFSET], fix0, fix2);
+simd_h_dual_reduce_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2);
+simd_h_dual_reduce_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2);
 
 addStat(stats->calculated_forces, 1);
 addStat(stats->num_neighs, numneighs);
@@ -285,7 +240,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
 
 LIKWID_MARKER_STOP("force");
 double E = getTimeStamp();
-DEBUG_MESSAGE("computeForceLJ_4xn end\n");
+DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
 return E-S;
 }
 
@@ -298,7 +253,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 MD_FLOAT epsilon = param->epsilon;
 MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
 MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
-MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon);
+MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
 MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
 MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
 double S = getTimeStamp();
@@ -348,7 +303,6 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
 MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
 MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
-
 MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
 MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
 MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
@@ -363,14 +317,14 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
 
 #if CLUSTER_M == CLUSTER_N
-int cond0 = (unsigned int)(cj == ci_cj0);
+unsigned int cond0 = (unsigned int)(cj == ci_cj0);
 MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
 MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
 MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
 MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
 #elif CLUSTER_M < CLUSTER_N
-int cond0 = (unsigned int)((cj << 1) + 0 == ci);
-int cond1 = (unsigned int)((cj << 1) + 1 == ci);
+unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
+unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
 MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
 MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
 MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
@@ -404,10 +358,10 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
 MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
 
-MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec))));
-MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec))));
-MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec))));
-MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec))));
+MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
+MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
+MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
+MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
 
 fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
 fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
@@ -423,18 +377,18 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
 }
 
-ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0);
-ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1);
-ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2);
-ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3);
-ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0);
-ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1);
-ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2);
-ci_f[CL_Y_OFFSET + 3] = simd_horizontal_sum(fiy3);
-ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0);
-ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1);
-ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2);
-ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3);
+ci_f[CL_X_OFFSET + 0] = simd_h_reduce_sum(fix0);
+ci_f[CL_X_OFFSET + 1] = simd_h_reduce_sum(fix1);
+ci_f[CL_X_OFFSET + 2] = simd_h_reduce_sum(fix2);
+ci_f[CL_X_OFFSET + 3] = simd_h_reduce_sum(fix3);
+ci_f[CL_Y_OFFSET + 0] = simd_h_reduce_sum(fiy0);
+ci_f[CL_Y_OFFSET + 1] = simd_h_reduce_sum(fiy1);
+ci_f[CL_Y_OFFSET + 2] = simd_h_reduce_sum(fiy2);
+ci_f[CL_Y_OFFSET + 3] = simd_h_reduce_sum(fiy3);
+ci_f[CL_Z_OFFSET + 0] = simd_h_reduce_sum(fiz0);
+ci_f[CL_Z_OFFSET + 1] = simd_h_reduce_sum(fiz1);
+ci_f[CL_Z_OFFSET + 2] = simd_h_reduce_sum(fiz2);
+ci_f[CL_Z_OFFSET + 3] = simd_h_reduce_sum(fiz3);
 
 addStat(stats->calculated_forces, 1);
 addStat(stats->num_neighs, numneighs);
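A minimal standalone sketch of the new half-mask packing in the 2xnn kernel above: two per-row exclusion masks are packed into one SIMD mask, with the second row shifted into the upper half by half_mask_bits = VECTOR_WIDTH >> 1. Values below are illustrative (CLUSTER_M == CLUSTER_N, VECTOR_WIDTH = 8, cond0 = 1 modelling the self-interaction cluster):

    #include <stdio.h>

    int main(void) {
        unsigned int half_mask_bits = 8 >> 1;    /* VECTOR_WIDTH >> 1 */
        unsigned int cond0 = 1;                  /* cj == ci_cj0 */
        unsigned int mask0 = 0xf - 0x1 * cond0;  /* row 0: 0b1110, lane 0 masked out */
        unsigned int mask1 = 0xf - 0x2 * cond0;  /* row 1: 0b1101, lane 1 masked out */
        /* as in the kernel: row 1 goes to the upper half, row 0 to the lower half */
        unsigned int excl_mask0 = (mask1 << half_mask_bits) | mask0;
        printf("excl_mask0 = 0x%x\n", excl_mask0); /* prints 0xde */
        return 0;
    }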
@@ -21,6 +21,7 @@
 * =======================================================================================
 */
 
+#include <stdlib.h>
 #include <string.h>
 #include <immintrin.h>
 #include <zmmintrin.h>
@@ -28,7 +29,9 @@
 #define SIMD_PRINT_REAL(a) simd_print_real(#a, a);
 #define SIMD_PRINT_MASK(a) simd_print_mask(#a, a);
 
-#if VECTOR_WIDTH == 8 // AVX512
+#ifdef AVX512
 
+#if PRECISION == 2 // Double precision
+
 #define MD_SIMD_FLOAT __m512d
 #define MD_SIMD_MASK __mmask8
@@ -47,15 +50,98 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_m
 static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
 static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); }
 
-static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
 MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
 x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11));
 x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01));
 return *((MD_FLOAT *) &x);
 }
 
+static inline MD_SIMD_FLOAT simd_load_h_duplicate(const double* m) {
+return _mm512_broadcast_f64x4(_mm256_load_pd(m));
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_dual(const double* m) {
+return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1);
+}
+
+static inline double simd_h_dual_reduce_sum(double* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
+__m512d t0;
+__m256d t2, t3;
+
+t0 = _mm512_add_pd(v0, _mm512_permutex_pd(v0, 0x4e));
+t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xccul), v1, _mm512_permutex_pd(v1, 0x4e));
+t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
+t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0xaaul), t0, t0, 0xee);
+t2 = _mm512_castpd512_pd256(t0);
+t3 = _mm256_load_pd(m);
+t3 = _mm256_add_pd(t3, t2);
+_mm256_store_pd(m, t3);
+
+t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
+t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
+return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
+}
+
+#else // Single-precision
+
+#define MD_SIMD_FLOAT __m512
+#define MD_SIMD_MASK __mmask16
+
+static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
+static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
+static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_ps(a, b, c); }
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_ps(a); }
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_ps(a, m, a, b); }
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask16(a, b); }
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); }
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); }
+static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); }
+
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+// This would only be called in a Mx16 configuration, which is not valid in GROMACS
+fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
+exit(-1);
+return 0.0;
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_duplicate(const float* m) {
+return _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_load_pd((const double *)(m))));
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_dual(const float* m) {
+return _mm512_shuffle_f32x4(_mm512_broadcastss_ps(_mm_load_ss(m)), _mm512_broadcastss_ps(_mm_load_ss(m + 1)), 0x44);
+}
+
+static inline MD_FLOAT simd_h_dual_reduce_sum(float* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
+__m512 t0, t1;
+__m128 t2, t3;
+
+t0 = _mm512_shuffle_f32x4(v0, v1, 0x88);
+t1 = _mm512_shuffle_f32x4(v0, v1, 0xdd);
+t0 = _mm512_add_ps(t0, t1);
+t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0x4e));
+t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0xb1));
+t0 = _mm512_maskz_compress_ps(simd_mask_from_u32(0x1111ul), t0);
+t3 = _mm512_castps512_ps128(t0);
+t2 = _mm_load_ps(m);
+t2 = _mm_add_ps(t2, t3);
+_mm_store_ps(m, t2);
+
+t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0x4e));
+t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1));
+return _mm_cvtss_f32(t3);
+}
+
+#endif // PRECISION
+
 #else // AVX or AVX2
 
+#if PRECISION == 2 // Double precision
+
 #define MD_SIMD_FLOAT __m256d
 
 #ifdef NO_AVX2
@@ -72,6 +158,7 @@ static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return
 static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
 
 #ifdef NO_AVX2
+
 static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
 static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return simd_add(simd_mul(a, b), c); }
 static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
@@ -85,7 +172,7 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
 }
 // TODO: Implement this, althrough it is just required for debugging
 static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; }
-static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
 __m128d a0, a1;
 a = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101));
 a0 = _mm256_castpd256_pd128(a);
@@ -93,7 +180,9 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
 a0 = _mm_add_sd(a0, a1);
 return *((MD_FLOAT *) &a0);
 }
-#else
+
+#else // AVX2
+
 static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); }
 static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
 static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_pd(a, m, a, b); }
@@ -101,7 +190,7 @@ static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) {
 static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
 static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
 static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
-static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
 __m128d a0, a1;
 // test with shuffle & add as an alternative to hadd later
 a = _mm256_hadd_pd(a, a);
@@ -110,11 +199,53 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
 a0 = _mm_add_sd(a0, a1);
 return *((MD_FLOAT *) &a0);
 }
-#endif
 
 
 #endif
 
+#else // Single-precision
+
+#define MD_SIMD_FLOAT __m256
+
+#ifdef NO_AVX2
+#define MD_SIMD_MASK __m256
+#else
+#define MD_SIMD_MASK __mmask8
+#endif
+
+static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm256_set1_ps(scalar); }
+static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
+static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
+
+#ifdef NO_AVX2
+
+#error "AVX intrinsincs with single-precision not implemented!"
+
+#else // AVX2
+
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_pd_mask(a, b, _CMP_LT_OQ); }
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
+static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+__m128 t0;
+t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
+t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
+t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
+return *((MD_FLOAT *) &t0);
+}
+
+#endif // NO_AVX2
+
+#endif // PRECISION
+
+#endif // AVX or AVX2
 
 static inline void simd_print_real(const char *ref, MD_SIMD_FLOAT a) {
 double x[VECTOR_WIDTH];
 memcpy(x, &a, sizeof(x));
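A standalone check of the dual-half load semantics on the double-precision AVX-512 path; it mirrors the body of simd_load_h_dual() above (compile with -mavx512f; values illustrative). The first element lands in the lower four lanes and the second in the upper four, so one register carries two i-atoms while simd_load_h_duplicate() fills both halves with the same j-cluster:

    #include <stdio.h>
    #include <immintrin.h>

    int main(void) {
        double m[2] = {1.0, 2.0};
        /* same intrinsic sequence as simd_load_h_dual(m) in the header above */
        __m512d v = _mm512_insertf64x4(
            _mm512_broadcastsd_pd(_mm_load_sd(m)),
            _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1);
        double out[8];
        _mm512_storeu_pd(out, v);
        for (int i = 0; i < 8; i++) printf("%.1f ", out[i]); /* 1.0 x4, then 2.0 x4 */
        printf("\n");
        return 0;
    }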
@@ -1,15 +1,18 @@
 ifeq ($(strip $(ISA)), SSE)
-VECTOR_WIDTH=2
+_VECTOR_WIDTH=2
 else ifeq ($(strip $(ISA)), AVX)
 # Vector width is 4 but AVX2 instruction set is not supported
 NO_AVX2=true
-VECTOR_WIDTH=4
+_VECTOR_WIDTH=4
 else ifeq ($(strip $(ISA)), AVX2)
-VECTOR_WIDTH=4
+_VECTOR_WIDTH=4
 else ifeq ($(strip $(ISA)), AVX512)
-VECTOR_WIDTH=8
+AVX512=true
+_VECTOR_WIDTH=8
 endif
 
 ifeq ($(strip $(DATA_TYPE)), SP)
-VECTOR_WIDTH=$((VECTOR_WIDTH * 2))
+VECTOR_WIDTH=$(shell echo $$(( $(_VECTOR_WIDTH) * 2 )))
+else
+VECTOR_WIDTH=$(_VECTOR_WIDTH)
 endif
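Worked evaluation of the fixed width computation: the old line used $((...)), which is shell arithmetic that GNU make never expands, so routing it through $(shell echo $$(( ... ))) is what actually performs the multiplication. For ISA=AVX512:

    _VECTOR_WIDTH = 8                                          # doubles per 512-bit register
    DATA_TYPE=SP: VECTOR_WIDTH = $(shell echo $$(( 8 * 2 )))   # -> 16 floats per register
    DATA_TYPE=DP: VECTOR_WIDTH = $(_VECTOR_WIDTH)              # -> 8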
@@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) {
 
 if(!csv) {
 printf("Number of timesteps: %d\n", param.ntimes);
-printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
-printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
 printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
 printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
 printf("Total number of atoms: %d\n", atom->Nlocal);
@@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) {
 E = getTimeStamp();
 double T_accum = E-S;
 double freq_hz = param.proc_freq * 1.e9;
-const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
-const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
-const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
+const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
+const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
 const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
 
 if(!csv) {
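Worked example of the simplified statistics (illustrative numbers, not from the commit): with atom->Nlocal = 131072, param.ntimes = 200, T_accum = 2.0 s and param.proc_freq = 2.4 (GHz):

    atoms_updates_per_sec = 131072 / 2.0 * 200         ~ 1.31e7
    cycles_per_atom       = 2.0 / 131072 / 200 * 2.4e9 ~ 183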