Add diagonal checks
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
		| @@ -51,6 +51,8 @@ void createAtom(Atom *atom, Parameter *param) { | |||||||
|     atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); |     atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); | ||||||
|     atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); |     atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); | ||||||
|     atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT)); |     atom->exclusion_filter = allocate(ALIGNMENT, CLUSTER_M * VECTOR_WIDTH * sizeof(MD_UINT)); | ||||||
|  |     atom->diagonal_4xn_j_minus_i = allocate(ALIGNMENT, MAX(CLUSTER_M, VECTOR_WIDTH) * sizeof(MD_UINT)); | ||||||
|  |     atom->diagonal_2xnn_j_minus_i = allocate(ALIGNMENT, VECTOR_WIDTH * sizeof(MD_UINT)); | ||||||
|  |  | ||||||
|     for(int i = 0; i < atom->ntypes * atom->ntypes; i++) { |     for(int i = 0; i < atom->ntypes * atom->ntypes; i++) { | ||||||
|         atom->epsilon[i] = param->epsilon; |         atom->epsilon[i] = param->epsilon; | ||||||
| @@ -59,6 +61,15 @@ void createAtom(Atom *atom, Parameter *param) { | |||||||
|         atom->cutforcesq[i] = param->cutforce * param->cutforce; |         atom->cutforcesq[i] = param->cutforce * param->cutforce; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     for(int j = 0; j < MAX(CLUSTER_M, VECTOR_WIDTH); j++) {    | ||||||
|  |         atom->diagonal_4xn_j_minus_i[j] = j - 0.5; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     for(int j = 0; j < VECTOR_WIDTH / 2; j++) { | ||||||
|  |         atom->diagonal_2xnn_j_minus_i[j] = j - 0.5; | ||||||
|  |         atom->diagonal_2xnn_j_minus_i[VECTOR_WIDTH / 2 + j] = j - 1 - 0.5; | ||||||
|  |     } | ||||||
|  |  | ||||||
|     for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) { |     for(int i = 0; i < CLUSTER_M * VECTOR_WIDTH; i++) { | ||||||
|         atom->exclusion_filter[i] = (1U << i); |         atom->exclusion_filter[i] = (1U << i); | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -23,10 +23,8 @@ static inline void gmx_load_simd_2xnn_interactions( | |||||||
|  |  | ||||||
|     //SimdInt32 mask_pr_S(excl); |     //SimdInt32 mask_pr_S(excl); | ||||||
|     MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl); |     MD_SIMD_INT32 mask_pr_S = simd_int32_broadcast(excl); | ||||||
|     *interact0 = cvtIB2B(simd_test_bits(mask_pr_S)); |     *interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0)); | ||||||
|     *interact2 = cvtIB2B(simd_test_bits(mask_pr_S)); |     *interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2)); | ||||||
|     //*interact0 = cvtIB2B(simd_test_bits(mask_pr_S & filter0)); |  | ||||||
|     //*interact2 = cvtIB2B(simd_test_bits(mask_pr_S & filter2)); |  | ||||||
| } | } | ||||||
|  |  | ||||||
| static inline void gmx_load_simd_4xn_interactions( | static inline void gmx_load_simd_4xn_interactions( | ||||||
| @@ -188,6 +186,32 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor | |||||||
|     MD_SIMD_BITMASK filter0 = simd_load_bitmask((const int *) &atom->exclusion_filter[0 * (VECTOR_WIDTH / UNROLL_J)]); |     MD_SIMD_BITMASK filter0 = simd_load_bitmask((const int *) &atom->exclusion_filter[0 * (VECTOR_WIDTH / UNROLL_J)]); | ||||||
|     MD_SIMD_BITMASK filter2 = simd_load_bitmask((const int *) &atom->exclusion_filter[2 * (VECTOR_WIDTH / UNROLL_J)]); |     MD_SIMD_BITMASK filter2 = simd_load_bitmask((const int *) &atom->exclusion_filter[2 * (VECTOR_WIDTH / UNROLL_J)]); | ||||||
|  |  | ||||||
|  |     MD_SIMD_FLOAT diagonal_jmi_S = simd_load(atom->diagonal_2xnn_j_minus_i); | ||||||
|  |     MD_SIMD_FLOAT zero_S = simd_broadcast(0.0); | ||||||
|  |     MD_SIMD_FLOAT one_S = simd_broadcast(1.0); | ||||||
|  |  | ||||||
|  |     /* | ||||||
|  |     #if UNROLL_I == UNROLL_J | ||||||
|  |     MD_SIMD_MASK diagonal_mask_S0, diagonal_mask_S2; | ||||||
|  |     diagonal_mask0 = (zero_S < diagonal_jmi_S); | ||||||
|  |     diagonal_jmi_S = diagonal_jmi_S - one_S; | ||||||
|  |     diagonal_jmi_S = diagonal_jmi_S - one_S; | ||||||
|  |     diagonal_mask2 = (zero_S < diagonal_jmi_S); | ||||||
|  |     #elif 2 * UNROLL_I == UNROLL_J | ||||||
|  |     */ | ||||||
|  |     MD_SIMD_MASK diagonal_mask00, diagonal_mask02, diagonal_mask10, diagonal_mask12; | ||||||
|  |     diagonal_mask00 = simd_mask_cond_lt(zero_S, diagonal_jmi_S); | ||||||
|  |     diagonal_jmi_S = diagonal_jmi_S - one_S; | ||||||
|  |     diagonal_jmi_S = diagonal_jmi_S - one_S; | ||||||
|  |     diagonal_mask02 = simd_mask_cond_lt(zero_S, diagonal_jmi_S); | ||||||
|  |     diagonal_jmi_S = diagonal_jmi_S - one_S; | ||||||
|  |     diagonal_jmi_S = diagonal_jmi_S - one_S; | ||||||
|  |     diagonal_mask10 = simd_mask_cond_lt(zero_S, diagonal_jmi_S); | ||||||
|  |     diagonal_jmi_S = diagonal_jmi_S - one_S; | ||||||
|  |     diagonal_jmi_S = diagonal_jmi_S - one_S; | ||||||
|  |     diagonal_mask12 = simd_mask_cond_lt(zero_S, diagonal_jmi_S); | ||||||
|  |     //#endif | ||||||
|  |  | ||||||
|     #pragma omp for |     #pragma omp for | ||||||
|     for(int ci = 0; ci < atom->Nclusters_local; ci++) { |     for(int ci = 0; ci < atom->Nclusters_local; ci++) { | ||||||
|         int ci_cj0 = CJ0_FROM_CI(ci); |         int ci_cj0 = CJ0_FROM_CI(ci); | ||||||
| @@ -222,6 +246,8 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor | |||||||
|             MD_SIMD_MASK interact0; |             MD_SIMD_MASK interact0; | ||||||
|             MD_SIMD_MASK interact2; |             MD_SIMD_MASK interact2; | ||||||
|  |  | ||||||
|  |             gmx_load_simd_2xnn_interactions((int)imask, filter0, filter2, &interact0, &interact2); | ||||||
|  |  | ||||||
|             MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]); |             MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]); | ||||||
|             MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]); |             MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]); | ||||||
|             MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]); |             MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]); | ||||||
| @@ -231,14 +257,28 @@ double computeForceLJ_2xnn_half(Parameter *param, Atom *atom, Neighbor *neighbor | |||||||
|             MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp); |             MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp); | ||||||
|             MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp); |             MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp); | ||||||
|             MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp); |             MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp); | ||||||
|  |  | ||||||
|             gmx_load_simd_2xnn_interactions((int) imask, filter0, filter2, &interact0, &interact2); |  | ||||||
|  |  | ||||||
|             MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0))); |             MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0))); | ||||||
|             MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2))); |             MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2))); | ||||||
|  |  | ||||||
|             MD_SIMD_MASK cutoff_mask0 = simd_mask_and(interact0, simd_mask_cond_lt(rsq0, cutforcesq_vec)); |             MD_SIMD_MASK cutoff_mask0 = simd_mask_cond_lt(rsq0, cutforcesq_vec); | ||||||
|             MD_SIMD_MASK cutoff_mask2 = simd_mask_and(interact2, simd_mask_cond_lt(rsq2, cutforcesq_vec)); |             MD_SIMD_MASK cutoff_mask2 = simd_mask_cond_lt(rsq2, cutforcesq_vec); | ||||||
|  |  | ||||||
|  |             /*#if UNROLL_J == UNROLL_I | ||||||
|  |             if(cj == ci) { | ||||||
|  |                 cutoff_mask0 = cutoff_mask0 && diagonal_mask0; | ||||||
|  |                 cutoff_mask2 = cutoff_mask2 && diagonal_mask2; | ||||||
|  |             } | ||||||
|  |             #elif UNROLL_J == 2 * UNROLL_I*/ | ||||||
|  |             if(cj * 2 == ci) { | ||||||
|  |                 cutoff_mask0 = cutoff_mask0 && diagonal_mask00; | ||||||
|  |                 cutoff_mask2 = cutoff_mask2 && diagonal_mask02; | ||||||
|  |             } else if (cj * 2 + 1 == ci) { | ||||||
|  |                 cutoff_mask0 = cutoff_mask0 && diagonal_mask10; | ||||||
|  |                 cutoff_mask2 = cutoff_mask2 && diagonal_mask12; | ||||||
|  |             } | ||||||
|  |             /*else | ||||||
|  |             #   error "Invalid cluster configuration!" | ||||||
|  |             #endif*/ | ||||||
|  |  | ||||||
|             MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0); |             MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0); | ||||||
|             MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); |             MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2); | ||||||
|   | |||||||
| @@ -33,12 +33,14 @@ | |||||||
| #   if VECTOR_WIDTH > CLUSTER_M * 2 | #   if VECTOR_WIDTH > CLUSTER_M * 2 | ||||||
| #       define KERNEL_NAME          "Simd2xNN" | #       define KERNEL_NAME          "Simd2xNN" | ||||||
| #       define CLUSTER_N            (VECTOR_WIDTH / 2) | #       define CLUSTER_N            (VECTOR_WIDTH / 2) | ||||||
|  | #       define UNROLL_I             4 | ||||||
| #       define UNROLL_J             2 | #       define UNROLL_J             2 | ||||||
| #       define computeForceLJ       computeForceLJ_2xnn | #       define computeForceLJ       computeForceLJ_2xnn | ||||||
| // Simd4xN | // Simd4xN | ||||||
| #   else | #   else | ||||||
| #       define KERNEL_NAME          "Simd4xN" | #       define KERNEL_NAME          "Simd4xN" | ||||||
| #       define CLUSTER_N            VECTOR_WIDTH | #       define CLUSTER_N            VECTOR_WIDTH | ||||||
|  | #       define UNROLL_I             4 | ||||||
| #       define UNROLL_J             1 | #       define UNROLL_J             1 | ||||||
| #       define computeForceLJ       computeForceLJ_4xn | #       define computeForceLJ       computeForceLJ_4xn | ||||||
| #   endif | #   endif | ||||||
| @@ -120,6 +122,8 @@ typedef struct { | |||||||
|     int *icluster_bin; |     int *icluster_bin; | ||||||
|     int dummy_cj; |     int dummy_cj; | ||||||
|     MD_UINT *exclusion_filter; |     MD_UINT *exclusion_filter; | ||||||
|  |     MD_FLOAT *diagonal_4xn_j_minus_i; | ||||||
|  |     MD_FLOAT *diagonal_2xnn_j_minus_i; | ||||||
| } Atom; | } Atom; | ||||||
|  |  | ||||||
| extern void initAtom(Atom*); | extern void initAtom(Atom*); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user