/*
 * Patch summary (descriptive preamble; the diff text below is unchanged).
 *
 * Purpose: move the construction of the exclusion/diagonal helper tables out
 * of createAtom() into a new initMasks() that initAtom() calls, and replace
 * the per-neighbour mask arithmetic in computeForceLJ_2xnn_half() with a
 * lookup into a precomputed table, atom->masks_2xnn.
 *
 * Makefile
 *   - The `clean` target additionally runs `rm -rf MDBench-$(IDENTIFIER)`.
 *
 * gromacs/atom.c
 *   - initAtom() now ends with a call to initMasks(atom).
 *   - createAtom() no longer allocates or fills exclusion_filter,
 *     diagonal_4xn_j_minus_i and diagonal_2xnn_j_minus_i.
 *   - New initMasks(Atom *atom): allocates those three arrays via
 *     allocate(ALIGNMENT, ...), fills them with the same expressions that
 *     were removed from createAtom(), then fills atom->masks_2xnn.
 *     Each table entry is (maskHi << (VECTOR_WIDTH >> 1)) | maskLo.
 *       CLUSTER_M == CLUSTER_N : index = cond0 * 2 + {0,1}, cond0 in {0,1}
 *                                (entries 0..3 written).
 *       otherwise              : index = cond0 * 4 + cond1 * 2 + {0,1},
 *                                cond0, cond1 in {0,1} (entries 0..7 written).
 *
 * gromacs/force_lj.c (computeForceLJ_2xnn_half)
 *   - Local half_mask_bits and mask0..mask3 are removed; excl_mask0 and
 *     excl_mask2 are now built with simd_mask_from_u32() from
 *     atom->masks_2xnn[...] using the same cond0/cond1 indexing as above.
 *   - sr6_0 and sr6_2 are now additionally multiplied by sigma6_vec.
 *
 * gromacs/includes/atom.h
 *   - Atom gains `unsigned int masks_2xnn[8];` and initMasks() is declared.
 *
 * NOTE(review): diagonal_4xn_j_minus_i and diagonal_2xnn_j_minus_i are
 *   declared MD_FLOAT* in atom.h but are sized with sizeof(MD_UINT); this is
 *   only correct if both types have the same size - confirm for every
 *   precision configuration.
 * NOTE(review): exclusion_filter[i] = (1U << i) runs i up to
 *   CLUSTER_M * VECTOR_WIDTH - 1; a shift count >= the width of unsigned int
 *   is undefined behaviour - confirm the product never exceeds that width.
 * NOTE(review): in the CLUSTER_M != CLUSTER_N table the entries for
 *   cond0 == 1 && cond1 == 1 are computed with wrapping unsigned subtraction
 *   (e.g. 0xff - 0xf - 0xff). In the CLUSTER_M < CLUSTER_N kernel path the two
 *   conditions compare the same ci against different values, so they cannot
 *   both hold; presumably the same is true for ci_cj0/ci_cj1 - verify.
 * NOTE(review): the commented-out allocate() for masks_2xnn looks like a
 *   leftover, since masks_2xnn is a fixed-size array inside Atom.
 * NOTE(review): initMasks() allocates on every initAtom() call; where these
 *   buffers are released is not visible in this patch - TODO confirm.
 */
diff --git a/Makefile b/Makefile index 83a25ad..354d33d 100644 --- a/Makefile +++ b/Makefile @@ -152,6 +152,7 @@ $(BUILD_DIR)/%.o: %.s clean: $(info ===> CLEAN) @rm -rf $(BUILD_DIR) + @rm -rf MDBench-$(IDENTIFIER) @rm -f tags cleanall: diff --git a/gromacs/atom.c b/gromacs/atom.c index bfb2ab7..eef4654 100644 --- a/gromacs/atom.c +++ b/gromacs/atom.c @@ -37,6 +37,7 @@ void initAtom(Atom *atom) { atom->iclusters = NULL; atom->jclusters = NULL; atom->icluster_bin = NULL; + initMasks(atom); } void createAtom(Atom *atom, Parameter *param) { @@ -50,9 +51,6 @@ void createAtom(Atom *atom, Parameter *param) { atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); - atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT)); - atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_UINT)); - atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_UINT)); for(int i = 0; i < atom->ntypes * atom->ntypes; i++) { atom->epsilon[i] = param->epsilon; @@ -61,19 +59,6 @@ void createAtom(Atom *atom, Parameter *param) { atom->cutforcesq[i] = param->cutforce * param->cutforce; } - for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) { - atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5; - } - - for(int j = 0; j < VECTOR_WIDTH / 2; j++) { - atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5; - atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5; - } - - for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) { - atom->exclusion_filter[i] = (1U << i); - } - MD_FLOAT alat = pow((4.0 / param->rho), (1.0 / 3.0)); int ilo = (int) (xlo / (0.5 * alat) - 1); int ihi = (int) (xhi / (0.5 * alat) + 1); @@ -409,6 +394,59 @@ int readAtom_dmp(Atom* atom, Parameter* param) { return 
natoms; } +void initMasks(Atom *atom) { + const unsigned int half_mask_bits = VECTOR_WIDTH >> 1; + unsigned int mask0, mask1, mask2, mask3; + + atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT)); + atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_UINT)); + atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_UINT)); + //atom->masks_2xnn = allocate(ALIGNMENT, 8 * sizeof(unsigned int)); + + for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) { + atom->diagonal_4xn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5; + } + + for(int j = 0; j < VECTOR_WIDTH / 2; j++) { + atom->diagonal_2xnn_j_minus_i[j] = (MD_FLOAT)(j) - 0.5; + atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = (MD_FLOAT)(j - 1) - 0.5; + } + + for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) { + atom->exclusion_filter[i] = (1U << i); + } + + #if CLUSTER_M == CLUSTER_N + for(unsigned int cond0 = 0; cond0 < 2; cond0++) { + mask0 = (unsigned int)(0xf - 0x1 * cond0); + mask1 = (unsigned int)(0xf - 0x3 * cond0); + mask2 = (unsigned int)(0xf - 0x7 * cond0); + mask3 = (unsigned int)(0xf - 0xf * cond0); + atom->masks_2xnn[cond0 * 2 + 0] = (mask1 << half_mask_bits) | mask0; + atom->masks_2xnn[cond0 * 2 + 1] = (mask3 << half_mask_bits) | mask2; + } + #else + for(unsigned int cond0 = 0; cond0 < 2; cond0++) { + for(unsigned int cond1 = 0; cond1 < 2; cond1++) { + #if CLUSTER_M < CLUSTER_N + mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1); + mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1); + mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1); + mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1); + #else + mask0 = (unsigned int)(0x3 - 0x1 * cond0); + mask1 = (unsigned int)(0x3 - 0x3 * cond0); + mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1); + mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1); + #endif + + atom->masks_2xnn[cond0 * 4 + cond1 * 2 + 0] = (mask1 << half_mask_bits) 
| mask0; + atom->masks_2xnn[cond0 * 4 + cond1 * 2 + 1] = (mask3 << half_mask_bits) | mask2; + } + } + #endif +} + void growAtom(Atom *atom) { int nold = atom->Nmax; atom->Nmax += DELTA; diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index 923d4ef..4070ae1 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -165,7 +165,6 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); - const unsigned int half_mask_bits = VECTOR_WIDTH >> 1; for(int ci = 0; ci < atom->Nclusters_local; ci++) { int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); @@ -236,7 +235,6 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor MD_SIMD_FLOAT fiz2 = simd_zero(); for(int k = 0; k < numneighs; k++) { - unsigned int mask0, mask1, mask2, mask3; int cj = neighs[k].cj; int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int imask = neighs[k].imask; @@ -261,30 +259,23 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor #if CLUSTER_M == CLUSTER_N unsigned int cond0 = (unsigned int)(cj == ci_cj0); - mask0 = (unsigned int)(0xf - 0x1 * cond0); - mask1 = (unsigned int)(0xf - 0x3 * cond0); - mask2 = (unsigned int)(0xf - 0x7 * cond0); - mask3 = (unsigned int)(0xf - 0xf * cond0); - #elif CLUSTER_M < CLUSTER_N + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn[cond0 * 2 + 0]); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn[cond0 * 2 + 1]); + #else + #if CLUSTER_M < CLUSTER_N unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci); unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci); - mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x1f * cond1); - mask1 = (unsigned int)(0xff - 0x3 * cond0 - 0x3f * cond1); - mask2 = (unsigned int)(0xff - 0x7 * cond0 - 0x7f * cond1); - mask3 = (unsigned int)(0xff - 0xf * cond0 - 0xff * cond1); #else unsigned int cond0 = 
(unsigned int)(cj == ci_cj0); unsigned int cond1 = (unsigned int)(cj == ci_cj1); - mask0 = (unsigned int)(0x3 - 0x1 * cond0); - mask1 = (unsigned int)(0x3 - 0x3 * cond0); - mask2 = (unsigned int)(0x3 - cond0 * 0x3 - 0x1 * cond1); - mask3 = (unsigned int)(0x3 - cond0 * 0x3 - 0x3 * cond1); + #endif + MD_SIMD_MASK excl_mask0 = simd_mask_from_u32(atom->masks_2xnn[cond0 * 4 + cond1 * 2 + 0]); + MD_SIMD_MASK excl_mask2 = simd_mask_from_u32(atom->masks_2xnn[cond0 * 4 + cond1 * 2 + 1]); #endif - MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0); - MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2); MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec); MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec); + cutoff_mask0 = simd_mask_and(cutoff_mask0, excl_mask0); cutoff_mask2 = simd_mask_and(cutoff_mask2, excl_mask2); @@ -308,8 +299,8 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0); MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); - MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0; - MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2; + MD_SIMD_FLOAT sr6_0 = sr2_0 * sr2_0 * sr2_0 * sigma6_vec; + MD_SIMD_FLOAT sr6_2 = sr2_2 * sr2_2 * sr2_2 * sigma6_vec; MD_SIMD_FLOAT force0 = c48_vec * sr6_0 * (sr6_0 - c05_vec) * sr2_0 * eps_vec; MD_SIMD_FLOAT force2 = c48_vec * sr6_2 * (sr6_2 - c05_vec) * sr2_2 * eps_vec; diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h index 92717fd..6bd3893 100644 --- a/gromacs/includes/atom.h +++ b/gromacs/includes/atom.h @@ -124,9 +124,11 @@ typedef struct { MD_UINT *exclusion_filter; MD_FLOAT *diagonal_4xn_j_minus_i; MD_FLOAT *diagonal_2xnn_j_minus_i; + unsigned int masks_2xnn[8]; } Atom; extern void initAtom(Atom*); +extern void initMasks(Atom*); extern void createAtom(Atom*, Parameter*); extern int readAtom(Atom*, Parameter*); extern int readAtom_pdb(Atom*, Parameter*);