Add first compilable version of Gromacs with SP

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
Rafael Ravedutti 2022-03-15 02:40:56 +01:00
parent 8669f2f6d7
commit d61576699d
6 changed files with 216 additions and 144 deletions

View File

@@ -17,9 +17,6 @@ INCLUDES += -I./$(SRC_DIR)/includes
ifeq ($(strip $(DATA_LAYOUT)),AOS)
DEFINES += -DAOS
endif
ifeq ($(strip $(CLUSTER_LAYOUT)),AOS)
DEFINES += -DCLUSTER_AOS
endif
ifeq ($(strip $(DATA_TYPE)),SP)
DEFINES += -DPRECISION=1
else
@@ -30,14 +27,6 @@ ifneq ($(ASM_SYNTAX), ATT)
ASFLAGS += -masm=intel
endif
ifneq ($(ATOMS_LOOP_RUNS),)
DEFINES += -DATOMS_LOOP_RUNS=$(ATOMS_LOOP_RUNS)
endif
ifneq ($(NEIGHBORS_LOOP_RUNS),)
DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS)
endif
ifeq ($(strip $(EXPLICIT_TYPES)),true)
DEFINES += -DEXPLICIT_TYPES
endif
@@ -74,6 +63,10 @@ ifeq ($(strip $(NO_AVX2)),true)
DEFINES += -DNO_AVX2
endif
ifeq ($(strip $(AVX512)),true)
DEFINES += -DAVX512
endif
VPATH = $(SRC_DIR) $(ASM_DIR)
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
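For reference (not part of this diff), the -DPRECISION define set above is presumably consumed by the project's precision header along these lines, with MD_FLOAT narrowing to float for SP builds:

/* Sketch of the assumed precision switch; illustrative only. */
#if PRECISION == 1
#define MD_FLOAT float   /* SP build: -DPRECISION=1 */
#else
#define MD_FLOAT double  /* DP build */
#endif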

View File

@@ -7,7 +7,7 @@ OPT_SCHEME ?= gromacs
# Enable likwid (true or false)
ENABLE_LIKWID ?= true
# SP or DP
DATA_TYPE ?= DP
DATA_TYPE ?= SP
# AOS or SOA
DATA_LAYOUT ?= AOS
# Assembly syntax to generate (ATT/INTEL)
@@ -15,10 +15,6 @@ ASM_SYNTAX ?= ATT
# Debug
DEBUG ?= false
# Number of times to run the atoms loop on stubbed variant
ATOMS_LOOP_RUNS ?= 1
# Number of times to run the neighbors loop on stubbed variant
NEIGHBORS_LOOP_RUNS ?= 1
# Explicitly store and load atom types (true or false)
EXPLICIT_TYPES ?= false
# Trace memory addresses for cache simulator (true or false)
@@ -29,8 +25,6 @@ INDEX_TRACER ?= false
COMPUTE_STATS ?= true
# Configurations for gromacs optimization scheme
# AOS or SOA
CLUSTER_LAYOUT ?= SOA
# Use reference version
USE_REFERENCE_VERSION ?= false
# Enable XTC output

View File

@@ -128,7 +128,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
}
double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
int Nlocal = atom->Nlocal;
int* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
@@ -136,10 +136,10 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
MD_FLOAT epsilon = param->epsilon;
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
const int unroll_j = VECTOR_WIDTH / CLUSTER_N;
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
double S = getTimeStamp();
LIKWID_MARKER_START("force");
@@ -152,131 +152,86 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]);
MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]);
MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]);
MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]);
MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]);
MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]);
MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]);
MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]);
MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]);
MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]);
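// 2xnn: i-rows 0/1 now share one SIMD register (low/high halves) and rows
// 2/3 share another, hence the half-dual loads replacing the broadcasts.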
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
MD_SIMD_FLOAT yi0_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 0]);
MD_SIMD_FLOAT yi2_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 2]);
MD_SIMD_FLOAT zi0_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 0]);
MD_SIMD_FLOAT zi2_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 2]);
MD_SIMD_FLOAT fix0 = simd_zero();
MD_SIMD_FLOAT fiy0 = simd_zero();
MD_SIMD_FLOAT fiz0 = simd_zero();
MD_SIMD_FLOAT fix1 = simd_zero();
MD_SIMD_FLOAT fiy1 = simd_zero();
MD_SIMD_FLOAT fiz1 = simd_zero();
MD_SIMD_FLOAT fix2 = simd_zero();
MD_SIMD_FLOAT fiy2 = simd_zero();
MD_SIMD_FLOAT fiz2 = simd_zero();
MD_SIMD_FLOAT fix3 = simd_zero();
MD_SIMD_FLOAT fiy3 = simd_zero();
MD_SIMD_FLOAT fiz3 = simd_zero();
for(int k = 0; k < numneighs; k += unroll_j) {
int cj = neighs[k + 0];
for(int k = 0; k < numneighs; k += 2) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
unsigned int cond0 = (unsigned int)(ci == cj);
unsigned int cond1 = cond0;
MD_SIMD_FLOAT xj_tmp, yj_tmp, zj_tmp;
/*
MD_FLOAT *cj0_ptr = cluster_pos_ptr(cj0);
#if VECTOR_WIDTH == 8
int cj1 = neighs[k + 1];
unsigned int cond1 = (unsigned int)(ci == cj1);
MD_FLOAT *cj1_ptr = cluster_pos_ptr(cj1);
MD_SIMD_FLOAT xj_tmp = simd_load2(cj0_ptr, cj1_ptr, 0);
MD_SIMD_FLOAT yj_tmp = simd_load2(cj0_ptr, cj1_ptr, 1);
MD_SIMD_FLOAT zj_tmp = simd_load2(cj0_ptr, cj1_ptr, 2);
#else
MD_SIMD_FLOAT xj_tmp = simd_load(cj0_ptr, 0);
MD_SIMD_FLOAT yj_tmp = simd_load(cj0_ptr, 1);
MD_SIMD_FLOAT zj_tmp = simd_load(cj0_ptr, 2);
#endif
*/
unsigned int mask0, mask1, mask2, mask3;
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
#if VECTOR_WIDTH == 8
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
#if CLUSTER_M == CLUSTER_N
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
mask0 = (unsigned int)(0xf - 0x1 * cond0);
mask1 = (unsigned int)(0xf - 0x2 * cond0);
mask2 = (unsigned int)(0xf - 0x4 * cond0);
mask3 = (unsigned int)(0xf - 0x8 * cond0);
#elif CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
#else
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
mask1 = (unsigned int)(0x3 - 0x2 * cond0);
mask2 = (unsigned int)(0x3 - 0x1 * cond1);
mask3 = (unsigned int)(0x3 - 0x2 * cond1);
#endif
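// Two i-rows share each register, so the two per-row exclusion masks are
// packed into a single register-wide mask (upper row shifted by half_mask_bits).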
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask3 = simd_mask_and(excl_mask3, simd_mask_cond_lt(rsq3, cutforcesq_vec));
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec))));
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec))));
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec))));
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1);
fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1);
fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1);
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3);
fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3);
fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
}
ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0);
ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1);
ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2);
ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3);
ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0);
ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1);
ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2);
ci_f[CL_Y_OFFSET + 3] = simd_horizontal_sum(fiy3);
ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0);
ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1);
ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2);
ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3);
simd_h_dual_reduce_sum(&ci_f[CL_X_OFFSET], fix0, fix2);
simd_h_dual_reduce_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2);
simd_h_dual_reduce_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2);
addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
@@ -285,7 +240,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
return E-S;
}
@@ -298,7 +253,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
MD_FLOAT epsilon = param->epsilon;
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
double S = getTimeStamp();
@@ -348,7 +303,6 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
@@ -363,14 +317,14 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
#if CLUSTER_M == CLUSTER_N
int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
#elif CLUSTER_M < CLUSTER_N
int cond0 = (unsigned int)((cj << 1) + 0 == ci);
int cond1 = (unsigned int)((cj << 1) + 1 == ci);
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
@@ -404,10 +358,10 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec))));
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec))));
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec))));
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
@@ -423,18 +377,18 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
}
ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0);
ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1);
ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2);
ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3);
ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0);
ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1);
ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2);
ci_f[CL_Y_OFFSET + 3] = simd_horizontal_sum(fiy3);
ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0);
ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1);
ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2);
ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3);
ci_f[CL_X_OFFSET + 0] = simd_h_reduce_sum(fix0);
ci_f[CL_X_OFFSET + 1] = simd_h_reduce_sum(fix1);
ci_f[CL_X_OFFSET + 2] = simd_h_reduce_sum(fix2);
ci_f[CL_X_OFFSET + 3] = simd_h_reduce_sum(fix3);
ci_f[CL_Y_OFFSET + 0] = simd_h_reduce_sum(fiy0);
ci_f[CL_Y_OFFSET + 1] = simd_h_reduce_sum(fiy1);
ci_f[CL_Y_OFFSET + 2] = simd_h_reduce_sum(fiy2);
ci_f[CL_Y_OFFSET + 3] = simd_h_reduce_sum(fiy3);
ci_f[CL_Z_OFFSET + 0] = simd_h_reduce_sum(fiz0);
ci_f[CL_Z_OFFSET + 1] = simd_h_reduce_sum(fiz1);
ci_f[CL_Z_OFFSET + 2] = simd_h_reduce_sum(fiz2);
ci_f[CL_Z_OFFSET + 3] = simd_h_reduce_sum(fiz3);
addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
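Both kernels vectorize the same per-pair arithmetic. As a reading aid (not part of the commit, names are illustrative), the scalar equivalent of one pair interaction:

/* Scalar sketch of the pair force both SIMD kernels compute. */
static inline MD_FLOAT lj_pair_force(MD_FLOAT rsq, MD_FLOAT sigma6,
                                     MD_FLOAT epsilon, MD_FLOAT cutforcesq) {
    if(rsq >= cutforcesq) return 0.0;  // cutoff_mask zeroes this contribution
    MD_FLOAT sr2 = 1.0 / rsq;          // simd_reciprocal approximates this
    MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
    return 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;  // c48_vec, c05_vec, eps_vec
}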

View File

@@ -21,6 +21,7 @@
* =======================================================================================
*/
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#include <zmmintrin.h>
@@ -28,7 +29,9 @@
#define SIMD_PRINT_REAL(a) simd_print_real(#a, a);
#define SIMD_PRINT_MASK(a) simd_print_mask(#a, a);
#if VECTOR_WIDTH == 8 // AVX512
#ifdef AVX512
#if PRECISION == 2 // Double precision
#define MD_SIMD_FLOAT __m512d
#define MD_SIMD_MASK __mmask8
@@ -47,15 +50,98 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); }
static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11));
x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01));
return *((MD_FLOAT *) &x);
}
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const double* m) {
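// Load W/2 contiguous values and duplicate them into both register halves,
// so both packed i-rows see the same j-cluster coordinates.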
return _mm512_broadcast_f64x4(_mm256_load_pd(m));
}
static inline MD_SIMD_FLOAT simd_load_h_dual(const double* m) {
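// Broadcast m[0] into the lower register half and m[1] into the upper half
// (one i-row coordinate per half).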
return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1);
}
static inline double simd_h_dual_reduce_sum(double* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
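// Sum the lanes of each of the four i-row halves held in v0/v1, accumulate
// the four row sums into m[0..3], and return their grand total.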
__m512d t0;
__m256d t2, t3;
t0 = _mm512_add_pd(v0, _mm512_permutex_pd(v0, 0x4e));
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xccul), v1, _mm512_permutex_pd(v1, 0x4e));
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0xaaul), t0, t0, 0xee);
t2 = _mm512_castpd512_pd256(t0);
t3 = _mm256_load_pd(m);
t3 = _mm256_add_pd(t3, t2);
_mm256_store_pd(m, t3);
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
}
#else // Single-precision
#define MD_SIMD_FLOAT __m512
#define MD_SIMD_MASK __mmask16
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_ps(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_ps(a, b); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_ps(a, b, c); }
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_ps(a); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_ps(a, m, a, b); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask16(a, b); }
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); }
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
// This would only be called in an Mx16 configuration, which is not valid in GROMACS
fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision, which is not valid!\n");
exit(-1);
return 0.0;
}
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const float* m) {
return _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_load_pd((const double *)(m))));
}
static inline MD_SIMD_FLOAT simd_load_h_dual(const float* m) {
return _mm512_shuffle_f32x4(_mm512_broadcastss_ps(_mm_load_ss(m)), _mm512_broadcastss_ps(_mm_load_ss(m + 1)), 0x44);
}
static inline MD_FLOAT simd_h_dual_reduce_sum(float* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
__m512 t0, t1;
__m128 t2, t3;
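// The 0x88/0xdd shuffles gather the low/high 128-bit lanes of v0 and v1 so
// that, after the add, 128-bit lane k holds the partial sums of i-row k.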
t0 = _mm512_shuffle_f32x4(v0, v1, 0x88);
t1 = _mm512_shuffle_f32x4(v0, v1, 0xdd);
t0 = _mm512_add_ps(t0, t1);
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0x4e));
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0xb1));
t0 = _mm512_maskz_compress_ps(simd_mask_from_u32(0x1111ul), t0);
t3 = _mm512_castps512_ps128(t0);
t2 = _mm_load_ps(m);
t2 = _mm_add_ps(t2, t3);
_mm_store_ps(m, t2);
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0x4e));
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1));
return _mm_cvtss_f32(t3);
}
#endif // PRECISION
#else // AVX or AVX2
#if PRECISION == 2 // Double precision
#define MD_SIMD_FLOAT __m256d
#ifdef NO_AVX2
@@ -72,6 +158,7 @@ static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
#ifdef NO_AVX2
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return simd_add(simd_mul(a, b), c); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
@@ -85,7 +172,7 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
}
// TODO: Implement this, although it is only required for debugging
static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; }
static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128d a0, a1;
a = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101));
a0 = _mm256_castpd256_pd128(a);
@@ -93,7 +180,9 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
a0 = _mm_add_sd(a0, a1);
return *((MD_FLOAT *) &a0);
}
#else
#else // AVX2
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_pd(a, m, a, b); }
@@ -101,7 +190,7 @@ static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) {
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128d a0, a1;
// test with shuffle & add as an alternative to hadd later
a = _mm256_hadd_pd(a, a);
@@ -110,11 +199,53 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
a0 = _mm_add_sd(a0, a1);
return *((MD_FLOAT *) &a0);
}
#endif
#endif
#else // Single-precision
#define MD_SIMD_FLOAT __m256
#ifdef NO_AVX2
#define MD_SIMD_MASK __m256
#else
#define MD_SIMD_MASK __mmask8
#endif
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm256_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0f); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
#ifdef NO_AVX2
#error "AVX intrinsincs with single-precision not implemented!"
#else // AVX2
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128 t0;
t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
return *((MD_FLOAT *) &t0);
}
#endif // NO_AVX2
#endif // PRECISION
#endif // AVX or AVX2
static inline void simd_print_real(const char *ref, MD_SIMD_FLOAT a) {
MD_FLOAT x[VECTOR_WIDTH];
memcpy(x, &a, sizeof(x));

View File

@@ -1,15 +1,18 @@
ifeq ($(strip $(ISA)), SSE)
VECTOR_WIDTH=2
_VECTOR_WIDTH=2
else ifeq ($(strip $(ISA)), AVX)
# Vector width is 4 but AVX2 instruction set is not supported
NO_AVX2=true
VECTOR_WIDTH=4
_VECTOR_WIDTH=4
else ifeq ($(strip $(ISA)), AVX2)
VECTOR_WIDTH=4
_VECTOR_WIDTH=4
else ifeq ($(strip $(ISA)), AVX512)
VECTOR_WIDTH=8
AVX512=true
_VECTOR_WIDTH=8
endif
ifeq ($(strip $(DATA_TYPE)), SP)
VECTOR_WIDTH=$((VECTOR_WIDTH * 2))
VECTOR_WIDTH=$(shell echo $$(( $(_VECTOR_WIDTH) * 2 )))
else
VECTOR_WIDTH=$(_VECTOR_WIDTH)
endif
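The removed line relied on $(( ... )), which is shell arithmetic that make never evaluates, so the doubled width came out as garbage; the replacement renames the per-ISA width to _VECTOR_WIDTH and shells out for the multiplication, so AVX512 with SP now yields 8 * 2 = 16 lanes. A hypothetical compile-time check of the result (not in the commit):

/* VECTOR_WIDTH lanes of MD_FLOAT must fill one MD_SIMD_FLOAT register. */
_Static_assert(VECTOR_WIDTH * sizeof(MD_FLOAT) == sizeof(MD_SIMD_FLOAT),
               "VECTOR_WIDTH does not match the SIMD register width");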

View File

@@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) {
if(!csv) {
printf("Number of timesteps: %d\n", param.ntimes);
printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
printf("Total number of atoms: %d\n", atom->Nlocal);
@@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) {
E = getTimeStamp();
double T_accum = E-S;
double freq_hz = param.proc_freq * 1.e9;
const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
if(!csv) {