diff --git a/Makefile b/Makefile
index 78289dc..9d3cc90 100644
--- a/Makefile
+++ b/Makefile
@@ -17,9 +17,6 @@ INCLUDES += -I./$(SRC_DIR)/includes
 ifeq ($(strip $(DATA_LAYOUT)),AOS)
 DEFINES += -DAOS
 endif
-ifeq ($(strip $(CLUSTER_LAYOUT)),AOS)
-DEFINES += -DCLUSTER_AOS
-endif
 ifeq ($(strip $(DATA_TYPE)),SP)
 DEFINES += -DPRECISION=1
 else
@@ -30,14 +27,6 @@ ifneq ($(ASM_SYNTAX), ATT)
 ASFLAGS += -masm=intel
 endif
 
-ifneq ($(ATOMS_LOOP_RUNS),)
-    DEFINES += -DATOMS_LOOP_RUNS=$(ATOMS_LOOP_RUNS)
-endif
-
-ifneq ($(NEIGHBORS_LOOP_RUNS),)
-    DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS)
-endif
-
 ifeq ($(strip $(EXPLICIT_TYPES)),true)
 DEFINES += -DEXPLICIT_TYPES
 endif
@@ -74,6 +63,10 @@ ifeq ($(strip $(NO_AVX2)),true)
 DEFINES += -DNO_AVX2
 endif
 
+ifeq ($(strip $(AVX512)),true)
+    DEFINES += -DAVX512
+endif
+
 VPATH = $(SRC_DIR) $(ASM_DIR)
 ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
diff --git a/config.mk b/config.mk
index de2bf9d..9afe7a0 100644
--- a/config.mk
+++ b/config.mk
@@ -7,7 +7,7 @@ OPT_SCHEME ?= gromacs
 # Enable likwid (true or false)
 ENABLE_LIKWID ?= true
 # SP or DP
-DATA_TYPE ?= DP
+DATA_TYPE ?= SP
 # AOS or SOA
 DATA_LAYOUT ?= AOS
 # Assembly syntax to generate (ATT/INTEL)
@@ -15,10 +15,6 @@ ASM_SYNTAX ?= ATT
 # Debug
 DEBUG ?= false
 
-# Number of times to run the atoms loop on stubbed variant
-ATOMS_LOOP_RUNS ?= 1
-# Number of times to run the neighbors loop on stubbed variant
-NEIGHBORS_LOOP_RUNS ?= 1
 # Explicitly store and load atom types (true or false)
 EXPLICIT_TYPES ?= false
 # Trace memory addresses for cache simulator (true or false)
@@ -29,8 +25,6 @@ INDEX_TRACER ?= false
 COMPUTE_STATS ?= true
 
 # Configurations for gromacs optimization scheme
-# AOS or SOA
-CLUSTER_LAYOUT ?= SOA
 # Use reference version
 USE_REFERENCE_VERSION ?= false
 # Enable XTC output
diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c
index 7b6d37f..41749ad 100644
--- a/gromacs/force_lj.c
+++ b/gromacs/force_lj.c
@@ -128,7 +128,7 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
 }
 
 double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
-    DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
+    DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
     int Nlocal = atom->Nlocal;
     int* neighs;
     MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
@@ -136,10 +136,10 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
     MD_FLOAT epsilon = param->epsilon;
     MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
     MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
-    MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon);
+    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
     MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
     MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
-    const int unroll_j = VECTOR_WIDTH / CLUSTER_N;
+    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
 
     double S = getTimeStamp();
     LIKWID_MARKER_START("force");
@@ -152,131 +152,86 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
         neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
         int numneighs = neighbor->numneigh[ci];
 
-        MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
-        MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
-        MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]);
-        MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]);
-        MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]);
-        MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]);
-        MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]);
-        MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]);
-        MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]);
-        MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]);
-        MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]);
-        MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]);
+        MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
+        MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
+        MD_SIMD_FLOAT yi0_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 0]);
+        MD_SIMD_FLOAT yi2_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 2]);
+        MD_SIMD_FLOAT zi0_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 0]);
+        MD_SIMD_FLOAT zi2_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 2]);
 
         MD_SIMD_FLOAT fix0 = simd_zero();
         MD_SIMD_FLOAT fiy0 = simd_zero();
         MD_SIMD_FLOAT fiz0 = simd_zero();
-        MD_SIMD_FLOAT fix1 = simd_zero();
-        MD_SIMD_FLOAT fiy1 = simd_zero();
-        MD_SIMD_FLOAT fiz1 = simd_zero();
         MD_SIMD_FLOAT fix2 = simd_zero();
         MD_SIMD_FLOAT fiy2 = simd_zero();
         MD_SIMD_FLOAT fiz2 = simd_zero();
-        MD_SIMD_FLOAT fix3 = simd_zero();
-        MD_SIMD_FLOAT fiy3 = simd_zero();
-        MD_SIMD_FLOAT fiz3 = simd_zero();
 
-        for(int k = 0; k < numneighs; k += unroll_j) {
-            int cj = neighs[k + 0];
+        for(int k = 0; k < numneighs; k += 2) {
+            int cj = neighs[k];
             int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
             MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
-            unsigned int cond0 = (unsigned int)(ci == cj);
-            unsigned int cond1 = cond0;
-            MD_SIMD_FLOAT xj_tmp, yj_tmp, zj_tmp;
-
-            /*
-            MD_FLOAT *cj0_ptr = cluster_pos_ptr(cj0);
-            #if VECTOR_WIDTH == 8
-            int cj1 = neighs[k + 1];
-            unsigned int cond1 = (unsigned int)(ci == cj1);
-            MD_FLOAT *cj1_ptr = cluster_pos_ptr(cj1);
-            MD_SIMD_FLOAT xj_tmp = simd_load2(cj0_ptr, cj1_ptr, 0);
-            MD_SIMD_FLOAT yj_tmp = simd_load2(cj0_ptr, cj1_ptr, 1);
-            MD_SIMD_FLOAT zj_tmp = simd_load2(cj0_ptr, cj1_ptr, 2);
-            #else
-            MD_SIMD_FLOAT xj_tmp = simd_load(cj0_ptr, 0);
-            MD_SIMD_FLOAT yj_tmp = simd_load(cj0_ptr, 1);
-            MD_SIMD_FLOAT zj_tmp = simd_load(cj0_ptr, 2);
-            #endif
-            */
+            unsigned int mask0, mask1, mask2, mask3;
+            MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
+            MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
+            MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
 
             MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
             MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
             MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
-            MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
-            MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
-            MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
             MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
             MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
             MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
-            MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
-            MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
-            MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
 
-            #if VECTOR_WIDTH == 8
-            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
-            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
-            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
-            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
+            #if CLUSTER_M == CLUSTER_N
+            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
+            mask0 = (unsigned int)(0xf - 0x1 * cond0);
+            mask1 = (unsigned int)(0xf - 0x2 * cond0);
+            mask2 = (unsigned int)(0xf - 0x4 * cond0);
+            mask3 = (unsigned int)(0xf - 0x8 * cond0);
+            #elif CLUSTER_M < CLUSTER_N
+            unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
+            unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
+            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
+            mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
+            mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
+            mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
             #else
-            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
-            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
-            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
-            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
+            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
+            unsigned int cond1 = (unsigned int)(cj == ci_cj1);
+            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
+            mask1 = (unsigned int)(0x3 - 0x2 * cond0);
+            mask2 = (unsigned int)(0x3 - 0x1 * cond1);
+            mask3 = (unsigned int)(0x3 - 0x2 * cond1);
             #endif
 
+            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
+            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
+
             MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
-            MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
             MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
-            MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
 
             MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
-            MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
             MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
-            MD_SIMD_MASK cutoff_mask3 = simd_mask_and(excl_mask3, simd_mask_cond_lt(rsq3, cutforcesq_vec));
 
             MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
-            MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
             MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
-            MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
 
             MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
-            MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
             MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
-            MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
 
-            MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec))));
-            MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec))));
-            MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec))));
-            MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec))));
+            MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
+            MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
 
             fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
             fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
             fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
-            fix1 = simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1);
-            fiy1 = simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1);
-            fiz1 = simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1);
             fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
             fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
             fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
-            fix3 = simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3);
-            fiy3 = simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3);
-            fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
         }
 
-        ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0);
-        ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1);
-        ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2);
-        ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3);
-        ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0);
-        ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1);
-        ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2);
-        ci_f[CL_Y_OFFSET + 3] = simd_horizontal_sum(fiy3);
-        ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0);
-        ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1);
-        ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2);
-        ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3);
+        simd_h_dual_reduce_sum(&ci_f[CL_X_OFFSET], fix0, fix2);
+        simd_h_dual_reduce_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2);
+        simd_h_dual_reduce_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2);
 
         addStat(stats->calculated_forces, 1);
         addStat(stats->num_neighs, numneighs);
@@ -285,7 +240,7 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta
 
     LIKWID_MARKER_STOP("force");
     double E = getTimeStamp();
-    DEBUG_MESSAGE("computeForceLJ_4xn end\n");
+    DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
     return E-S;
 }
 
@@ -298,7 +253,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
     MD_FLOAT epsilon = param->epsilon;
     MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
     MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
-    MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon);
+    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
     MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
     MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
     double S = getTimeStamp();
@@ -348,7 +303,6 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
             MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
             MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
             MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
-
             MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
             MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
             MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
@@ -363,14 +317,14 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
             MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
 
             #if CLUSTER_M == CLUSTER_N
-            int cond0 = (unsigned int)(cj == ci_cj0);
+            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
             MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
             MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
             MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
             MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
             #elif CLUSTER_M < CLUSTER_N
-            int cond0 = (unsigned int)((cj << 1) + 0 == ci);
-            int cond1 = (unsigned int)((cj << 1) + 1 == ci);
+            unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
+            unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
             MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
             MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
             MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
@@ -404,10 +358,10 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
             MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
             MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
 
-            MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec))));
-            MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec))));
-            MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec))));
-            MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec))));
+            MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
+            MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
+            MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
+            MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
 
             fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
             fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
@@ -423,18 +377,18 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
             fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
         }
 
-        ci_f[CL_X_OFFSET + 0] = simd_horizontal_sum(fix0);
-        ci_f[CL_X_OFFSET + 1] = simd_horizontal_sum(fix1);
-        ci_f[CL_X_OFFSET + 2] = simd_horizontal_sum(fix2);
-        ci_f[CL_X_OFFSET + 3] = simd_horizontal_sum(fix3);
-        ci_f[CL_Y_OFFSET + 0] = simd_horizontal_sum(fiy0);
-        ci_f[CL_Y_OFFSET + 1] = simd_horizontal_sum(fiy1);
-        ci_f[CL_Y_OFFSET + 2] = simd_horizontal_sum(fiy2);
-        ci_f[CL_Y_OFFSET + 3] = simd_horizontal_sum(fiy3);
-        ci_f[CL_Z_OFFSET + 0] = simd_horizontal_sum(fiz0);
-        ci_f[CL_Z_OFFSET + 1] = simd_horizontal_sum(fiz1);
-        ci_f[CL_Z_OFFSET + 2] = simd_horizontal_sum(fiz2);
-        ci_f[CL_Z_OFFSET + 3] = simd_horizontal_sum(fiz3);
+        ci_f[CL_X_OFFSET + 0] = simd_h_reduce_sum(fix0);
+        ci_f[CL_X_OFFSET + 1] = simd_h_reduce_sum(fix1);
+        ci_f[CL_X_OFFSET + 2] = simd_h_reduce_sum(fix2);
+        ci_f[CL_X_OFFSET + 3] = simd_h_reduce_sum(fix3);
+        ci_f[CL_Y_OFFSET + 0] = simd_h_reduce_sum(fiy0);
+        ci_f[CL_Y_OFFSET + 1] = simd_h_reduce_sum(fiy1);
+        ci_f[CL_Y_OFFSET + 2] = simd_h_reduce_sum(fiy2);
+        ci_f[CL_Y_OFFSET + 3] = simd_h_reduce_sum(fiy3);
+        ci_f[CL_Z_OFFSET + 0] = simd_h_reduce_sum(fiz0);
+        ci_f[CL_Z_OFFSET + 1] = simd_h_reduce_sum(fiz1);
+        ci_f[CL_Z_OFFSET + 2] = simd_h_reduce_sum(fiz2);
+        ci_f[CL_Z_OFFSET + 3] = simd_h_reduce_sum(fiz3);
 
         addStat(stats->calculated_forces, 1);
         addStat(stats->num_neighs, numneighs);
diff --git a/gromacs/includes/simd.h b/gromacs/includes/simd.h
index 1d6a90b..59a044a 100644
--- a/gromacs/includes/simd.h
+++ b/gromacs/includes/simd.h
@@ -21,6 +21,7 @@ * =======================================================================================
  */
 
+#include
 #include
 #include
 #include
@@ -28,7 +29,9 @@
 #define SIMD_PRINT_REAL(a) simd_print_real(#a, a);
 #define SIMD_PRINT_MASK(a) simd_print_mask(#a, a);
 
-#if VECTOR_WIDTH == 8 // AVX512
+#ifdef AVX512
+
+#if PRECISION == 2 // Double precision
 
 #define MD_SIMD_FLOAT __m512d
 #define MD_SIMD_MASK __mmask8
@@ -47,15 +50,98 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_m
 static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
 static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); }
 
-static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
     MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
     x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11));
     x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01));
     return *((MD_FLOAT *) &x);
 }
 
+static inline MD_SIMD_FLOAT simd_load_h_duplicate(const double* m) {
+    return _mm512_broadcast_f64x4(_mm256_load_pd(m));
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_dual(const double* m) {
+    return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1);
+}
+
+static inline double simd_h_dual_reduce_sum(double* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
+    __m512d t0;
+    __m256d t2, t3;
+
+    t0 = _mm512_add_pd(v0, _mm512_permutex_pd(v0, 0x4e));
+    t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xccul), v1, _mm512_permutex_pd(v1, 0x4e));
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
+    t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0xaaul), t0, t0, 0xee);
+    t2 = _mm512_castpd512_pd256(t0);
+    t3 = _mm256_load_pd(m);
+    t3 = _mm256_add_pd(t3, t2);
+    _mm256_store_pd(m, t3);
+
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
+    return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
+}
+
+#else // Single-precision
+
+#define MD_SIMD_FLOAT __m512
+#define MD_SIMD_MASK __mmask16
+
+static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
+static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
+static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_ps(a, b, c); }
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_ps(a); }
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_ps(a, m, a, b); }
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask16(a, b); }
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); }
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); }
+static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); }
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+    // This would only be called in a Mx16 configuration, which is not valid in GROMACS
+    fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
+    exit(-1);
+    return 0.0;
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_duplicate(const float* m) {
+    return _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_load_pd((const double *)(m))));
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_dual(const float* m) {
+    return _mm512_shuffle_f32x4(_mm512_broadcastss_ps(_mm_load_ss(m)), _mm512_broadcastss_ps(_mm_load_ss(m + 1)), 0x44);
+}
+
+static inline MD_FLOAT simd_h_dual_reduce_sum(float* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
+    __m512 t0, t1;
+    __m128 t2, t3;
+
+    t0 = _mm512_shuffle_f32x4(v0, v1, 0x88);
+    t1 = _mm512_shuffle_f32x4(v0, v1, 0xdd);
+    t0 = _mm512_add_ps(t0, t1);
+    t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0x4e));
+    t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0xb1));
+    t0 = _mm512_maskz_compress_ps(simd_mask_from_u32(0x1111ul), t0);
+    t3 = _mm512_castps512_ps128(t0);
+    t2 = _mm_load_ps(m);
+    t2 = _mm_add_ps(t2, t3);
+    _mm_store_ps(m, t2);
+
+    t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0x4e));
+    t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1));
+    return _mm_cvtss_f32(t3);
+}
+
+#endif // PRECISION
+
 #else // AVX or AVX2
 
+#if PRECISION == 2 // Double precision
+
 #define MD_SIMD_FLOAT __m256d
 
 #ifdef NO_AVX2
@@ -72,6 +158,7 @@ static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return
 static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
 
 #ifdef NO_AVX2
+
 static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
 static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return simd_add(simd_mul(a, b), c); }
 static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
@@ -85,7 +172,7 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
 }
 // TODO: Implement this, althrough it is just required for debugging
 static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; }
-static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
     __m128d a0, a1;
     a = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101));
     a0 = _mm256_castpd256_pd128(a);
@@ -93,7 +180,9 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
     a0 = _mm_add_sd(a0, a1);
     return *((MD_FLOAT *) &a0);
 }
-#else
+
+#else // AVX2
+
 static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); }
 static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
 static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_pd(a, m, a, b); }
@@ -101,7 +190,7 @@ static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) {
 static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
 static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
 static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
-static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
     __m128d a0, a1;
     // test with shuffle & add as an alternative to hadd later
     a = _mm256_hadd_pd(a, a);
@@ -110,11 +199,53 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
     a0 = _mm_add_sd(a0, a1);
     return *((MD_FLOAT *) &a0);
 }
-#endif
-
 #endif
 
+#else // Single-precision
+
+#define MD_SIMD_FLOAT __m256
+
+#ifdef NO_AVX2
+#define MD_SIMD_MASK __m256
+#else
+#define MD_SIMD_MASK __mmask8
+#endif
+
+static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm256_set1_ps(scalar); }
+static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
+static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
+
+#ifdef NO_AVX2
+
+#error "AVX intrinsics with single-precision not implemented!"
+
+#else // AVX2
+
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
+static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+    __m128 t0;
+    t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
+    t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
+    t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
+    return *((MD_FLOAT *) &t0);
+}
+
+#endif // NO_AVX2
+
+#endif // PRECISION
+
+#endif // AVX or AVX2
+
 static inline void simd_print_real(const char *ref, MD_SIMD_FLOAT a) {
     double x[VECTOR_WIDTH];
     memcpy(x, &a, sizeof(x));
diff --git a/include_ISA.mk b/include_ISA.mk
index 215f626..fa9b90b 100644
--- a/include_ISA.mk
+++ b/include_ISA.mk
@@ -1,15 +1,18 @@
 ifeq ($(strip $(ISA)), SSE)
-VECTOR_WIDTH=2
+_VECTOR_WIDTH=2
 else ifeq ($(strip $(ISA)), AVX)
 # Vector width is 4 but AVX2 instruction set is not supported
 NO_AVX2=true
-VECTOR_WIDTH=4
+_VECTOR_WIDTH=4
 else ifeq ($(strip $(ISA)), AVX2)
-VECTOR_WIDTH=4
+_VECTOR_WIDTH=4
 else ifeq ($(strip $(ISA)), AVX512)
-VECTOR_WIDTH=8
+AVX512=true
+_VECTOR_WIDTH=8
 endif
 
 ifeq ($(strip $(DATA_TYPE)), SP)
-VECTOR_WIDTH=$((VECTOR_WIDTH * 2))
+VECTOR_WIDTH=$(shell echo $$(( $(_VECTOR_WIDTH) * 2 )))
+else
+VECTOR_WIDTH=$(_VECTOR_WIDTH)
 endif
diff --git a/lammps/main-stub.c b/lammps/main-stub.c
index 4da90db..c7596e7 100644
--- a/lammps/main-stub.c
+++ b/lammps/main-stub.c
@@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) {
     if(!csv) {
         printf("Number of timesteps: %d\n", param.ntimes);
-        printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
-        printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
         printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
         printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
         printf("Total number of atoms: %d\n", atom->Nlocal);
@@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) {
     E = getTimeStamp();
     double T_accum = E-S;
     double freq_hz = param.proc_freq * 1.e9;
-    const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
-    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
-    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
+    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
+    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
     const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
 
     if(!csv) {
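
Note on the new half-SIMD helpers: in the reworked computeForceLJ_2xnn() kernel each AVX-512 register is treated as two independent halves, so xi0_tmp/xi2_tmp hold the coordinates of i-atoms 0/1 and 2/3 (one per half), xj_tmp holds the four j-cluster coordinates duplicated into both halves, and the two 4-bit exclusion masks of the paired i-atoms are packed into one register mask via (mask1 << half_mask_bits) | mask0. The sketch below gives plain-C reference semantics for the double-precision AVX-512 helpers added to simd.h (VECTOR_WIDTH 8, 4-element halves). It is an illustration of what the intrinsic sequences compute, not code from the patch; the ref_* names and the self-check in main() are made up for this example.

#include <stdio.h>

#define VECTOR_WIDTH 8   /* doubles per AVX-512 register */
#define HALF         4   /* elements per register half */

/* simd_load_h_duplicate(m): load HALF values and duplicate them into both halves. */
static void ref_load_h_duplicate(const double *m, double out[VECTOR_WIDTH]) {
    for(int i = 0; i < HALF; i++) { out[i] = m[i]; out[i + HALF] = m[i]; }
}

/* simd_load_h_dual(m): broadcast m[0] into the lower half and m[1] into the upper half. */
static void ref_load_h_dual(const double *m, double out[VECTOR_WIDTH]) {
    for(int i = 0; i < HALF; i++) { out[i] = m[0]; out[i + HALF] = m[1]; }
}

/* simd_h_dual_reduce_sum(m, v0, v1): add the four per-half sums (lower/upper half of v0,
 * lower/upper half of v1) into m[0..3] and return the total sum of both vectors. */
static double ref_h_dual_reduce_sum(double *m, const double *v0, const double *v1) {
    double half_sum[4] = {0.0, 0.0, 0.0, 0.0};
    for(int i = 0; i < HALF; i++) {
        half_sum[0] += v0[i];        /* lower half of v0 -> i-atom 0 */
        half_sum[1] += v0[i + HALF]; /* upper half of v0 -> i-atom 1 */
        half_sum[2] += v1[i];        /* lower half of v1 -> i-atom 2 */
        half_sum[3] += v1[i + HALF]; /* upper half of v1 -> i-atom 3 */
    }
    double total = 0.0;
    for(int i = 0; i < 4; i++) { m[i] += half_sum[i]; total += half_sum[i]; }
    return total;
}

int main(void) {
    /* Mimic the force reduction: fix0 carries forces on i-atoms 0/1, fix2 on i-atoms 2/3. */
    double fix0[VECTOR_WIDTH] = {1, 2, 3, 4, 10, 20, 30, 40};
    double fix2[VECTOR_WIDTH] = {5, 6, 7, 8, 50, 60, 70, 80};
    double ci_fx[4] = {0.0, 0.0, 0.0, 0.0};
    double total = ref_h_dual_reduce_sum(ci_fx, fix0, fix2);
    printf("fx per i-atom: %g %g %g %g (total %g)\n", ci_fx[0], ci_fx[1], ci_fx[2], ci_fx[3], total);
    return 0;
}

Compared with computeForceLJ_4xn(), which keeps one i-atom per register and therefore carries four delta/force/accumulator registers per dimension, the 2xnn variant carries two, at the cost of the slightly more involved dual reduction sketched above. The include_ISA.mk change exists because make has no built-in arithmetic: the old VECTOR_WIDTH=$((VECTOR_WIDTH * 2)) line used shell syntax that make never evaluates, so the width was not actually doubled, whereas the new $(shell echo $$(( $(_VECTOR_WIDTH) * 2 ))) defers the arithmetic to a shell and yields the doubled width for single-precision builds.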