From 4090f430956b6f261190f8d9d0aa877b05383881 Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Wed, 16 Mar 2022 17:54:52 +0100 Subject: [PATCH] Optimize partial forces reduction for compute_4xn kernel Signed-off-by: Rafael Ravedutti --- gromacs/force_lj.c | 21 ++++++++++++++++--- gromacs/includes/simd/avx512_double.h | 30 +++++++++++++++++++++++---- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c index afdc311..bb2585d 100644 --- a/gromacs/force_lj.c +++ b/gromacs/force_lj.c @@ -239,9 +239,9 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Sta fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2); } - simd_h_dual_reduce_sum(&ci_f[CL_X_OFFSET], fix0, fix2); - simd_h_dual_reduce_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2); - simd_h_dual_reduce_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2); + simd_h_dual_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix2); + simd_h_dual_incr_reduced_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2); + simd_h_dual_incr_reduced_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2); addStat(stats->calculated_forces, 1); addStat(stats->num_neighs, numneighs); @@ -269,6 +269,16 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat double S = getTimeStamp(); LIKWID_MARKER_START("force"); + for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { + ci_f[CL_X_OFFSET + cii] = 0.0; + ci_f[CL_Y_OFFSET + cii] = 0.0; + ci_f[CL_Z_OFFSET + cii] = 0.0; + } + } + #pragma omp parallel for for(int ci = 0; ci < atom->Nclusters_local; ci++) { int ci_cj0 = CJ0_FROM_CI(ci); @@ -387,6 +397,10 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3); } + simd_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix1, fix2, fix3); + simd_incr_reduced_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy1, fiy2, fiy3); + simd_incr_reduced_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz1, fiz2, fiz3); + /* ci_f[CL_X_OFFSET + 0] = simd_h_reduce_sum(fix0); ci_f[CL_X_OFFSET + 1] = simd_h_reduce_sum(fix1); ci_f[CL_X_OFFSET + 2] = simd_h_reduce_sum(fix2); @@ -399,6 +413,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat ci_f[CL_Z_OFFSET + 1] = simd_h_reduce_sum(fiz1); ci_f[CL_Z_OFFSET + 2] = simd_h_reduce_sum(fiz2); ci_f[CL_Z_OFFSET + 3] = simd_h_reduce_sum(fiz3); + */ addStat(stats->calculated_forces, 1); addStat(stats->num_neighs, numneighs); diff --git a/gromacs/includes/simd/avx512_double.h b/gromacs/includes/simd/avx512_double.h index be45949..fabc2dc 100644 --- a/gromacs/includes/simd/avx512_double.h +++ b/gromacs/includes/simd/avx512_double.h @@ -29,7 +29,7 @@ #define MD_SIMD_FLOAT __m512d #define MD_SIMD_MASK __mmask8 -static inline MD_SIMD_FLOAT simd_broadcast(double scalar) { return _mm512_set1_pd(scalar); } +static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); } static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); } static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); } static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_pd(a, b); } @@ -50,15 +50,37 @@ static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) { return *((MD_FLOAT *) &x); } -static inline MD_SIMD_FLOAT simd_load_h_duplicate(const double* m) { +static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) { + __m512d t0, t2; + __m256d t3, t4; + + t0 = _mm512_add_pd(v0, _mm512_permute_pd(v0, 0x55)); + t2 = _mm512_add_pd(v2, _mm512_permute_pd(v2, 0x55)); + t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xaa), v1, _mm512_permute_pd(v1, 0x55)); + t2 = _mm512_mask_add_pd(t2, simd_mask_from_u32(0xaa), v3, _mm512_permute_pd(v3, 0x55)); + t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0x4e)); + t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xF0), t2, _mm512_shuffle_f64x2(t2, t2, 0x4e)); + t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0xb1)); + t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0x0C), t0, t0, 0xee); + t3 = _mm512_castpd512_pd256(t0); + t4 = _mm256_load_pd(m); + t4 = _mm256_add_pd(t4, t3); + _mm256_store_pd(m, t4); + + t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e)); + t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1)); + return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0)); +} + +static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) { return _mm512_broadcast_f64x4(_mm256_load_pd(m)); } -static inline MD_SIMD_FLOAT simd_load_h_dual(const double* m) { +static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) { return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1); } -static inline double simd_h_dual_reduce_sum(double* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) { +static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) { __m512d t0; __m256d t2, t3;