Optimize partial forces reduction for compute_4xn kernel
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
Parent: f3263a2d48
Commit: 4090f43095
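In both Lennard-Jones kernels, the per-i-atom partial forces are now reduced with increment-style helpers that accumulate into the cluster force arrays (cl_f) instead of the previous store-based horizontal sums: simd_h_dual_incr_reduced_sum in the 2xnn kernel and simd_incr_reduced_sum in the 4xn kernel. The 4xn kernel additionally gains an up-front pass that zeroes the cluster force buffers before the parallel force loop.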
@@ -239,9 +239,9 @@ double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
             fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
         }

-        simd_h_dual_reduce_sum(&ci_f[CL_X_OFFSET], fix0, fix2);
-        simd_h_dual_reduce_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2);
-        simd_h_dual_reduce_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2);
+        simd_h_dual_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix2);
+        simd_h_dual_incr_reduced_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2);
+        simd_h_dual_incr_reduced_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2);

         addStat(stats->calculated_forces, 1);
         addStat(stats->num_neighs, numneighs);
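Here the 2xnn kernel's three store-based reductions become increment-based ones. Judging by simd_load_h_dual() in the SIMD header below, each 8-lane vector in this kernel carries the partial forces of two i-atoms, one per 256-bit half, and the rename together with the new zeroing pass suggests the old helper overwrote m while the new one accumulates into it. A scalar sketch of the semantics the renamed helper needs to provide (the ref_ function is hypothetical, not part of the patch):

/* Scalar sketch (hypothetical, not from the patch) of what
 * simd_h_dual_incr_reduced_sum(m, v0, v1) is expected to compute for
 * 8-lane double vectors: v0 carries the partial forces of i-atoms 0
 * (lanes 0-3) and 1 (lanes 4-7), v1 those of i-atoms 2 and 3. The
 * per-atom sums are added into m instead of stored, so m must be
 * pre-zeroed once per timestep. */
static inline double ref_h_dual_incr_reduced_sum(double *m, const double v0[8], const double v1[8]) {
    double s[4] = { 0.0, 0.0, 0.0, 0.0 };
    for (int lane = 0; lane < 4; lane++) {
        s[0] += v0[lane];     /* i-atom 0: low half of v0  */
        s[1] += v0[lane + 4]; /* i-atom 1: high half of v0 */
        s[2] += v1[lane];     /* i-atom 2: low half of v1  */
        s[3] += v1[lane + 4]; /* i-atom 3: high half of v1 */
    }
    double total = 0.0;
    for (int i = 0; i < 4; i++) { m[i] += s[i]; total += s[i]; }
    return total; /* grand total across all lanes */
}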
@@ -269,6 +269,16 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
     double S = getTimeStamp();
     LIKWID_MARKER_START("force");

+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
+        for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
+            ci_f[CL_X_OFFSET + cii] = 0.0;
+            ci_f[CL_Y_OFFSET + cii] = 0.0;
+            ci_f[CL_Z_OFFSET + cii] = 0.0;
+        }
+    }
+
     #pragma omp parallel for
     for(int ci = 0; ci < atom->Nclusters_local; ci++) {
         int ci_cj0 = CJ0_FROM_CI(ci);
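This zeroing pass is what makes the switch to increment-based reductions safe: simd_incr_reduced_sum() adds into ci_f rather than overwriting it, so every cluster's force buffer must start the timestep at zero. Note that the pass runs serially, before the #pragma omp parallel for region that computes the forces.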
@@ -387,6 +397,10 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
             fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
         }

+        simd_incr_reduced_sum(&ci_f[CL_X_OFFSET], fix0, fix1, fix2, fix3);
+        simd_incr_reduced_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy1, fiy2, fiy3);
+        simd_incr_reduced_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz1, fiz2, fiz3);
+        /*
         ci_f[CL_X_OFFSET + 0] = simd_h_reduce_sum(fix0);
         ci_f[CL_X_OFFSET + 1] = simd_h_reduce_sum(fix1);
         ci_f[CL_X_OFFSET + 2] = simd_h_reduce_sum(fix2);
@@ -399,6 +413,7 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
         ci_f[CL_Z_OFFSET + 1] = simd_h_reduce_sum(fiz1);
         ci_f[CL_Z_OFFSET + 2] = simd_h_reduce_sum(fiz2);
         ci_f[CL_Z_OFFSET + 3] = simd_h_reduce_sum(fiz3);
+        */

         addStat(stats->calculated_forces, 1);
         addStat(stats->num_neighs, numneighs);
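In the 4xn kernel each of fix0..fix3 holds the partial x-forces of one i-atom across all 8 lanes. The now-commented block kept for reference did one simd_h_reduce_sum call plus one scalar store per i-atom per dimension (twelve in total); the replacement reduces four vectors per call and accumulates directly into memory. A scalar sketch of the intended semantics (the ref_ function is hypothetical, not part of the patch):

/* Scalar sketch (hypothetical, not from the patch) of
 * simd_incr_reduced_sum(m, v0, v1, v2, v3) for 8-lane double vectors:
 * each vi holds the partial forces of i-atom i, m[i] is incremented by
 * the horizontal sum of vi, and the grand total is returned. */
static inline double ref_incr_reduced_sum(double *m, const double v[4][8]) {
    double total = 0.0;
    for (int i = 0; i < 4; i++) {
        double s = 0.0;
        for (int lane = 0; lane < 8; lane++) s += v[i][lane];
        m[i] += s; /* increment, not overwrite */
        total += s;
    }
    return total;
}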
@@ -29,7 +29,7 @@
 #define MD_SIMD_FLOAT __m512d
 #define MD_SIMD_MASK __mmask8

-static inline MD_SIMD_FLOAT simd_broadcast(double scalar) { return _mm512_set1_pd(scalar); }
+static inline MD_SIMD_FLOAT simd_broadcast(MD_FLOAT scalar) { return _mm512_set1_pd(scalar); }
 static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
 static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
 static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_pd(a, b); }
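Alongside the reduction changes, the helper signatures in the AVX-512 double-precision header switch from hard-coded double to the project-wide MD_FLOAT type. Behavior is unchanged in this variant, since MD_SIMD_FLOAT is __m512d, but the helpers now stay consistent with the types their callers use.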
@@ -50,15 +50,37 @@ static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
     return *((MD_FLOAT *) &x);
 }

-static inline MD_SIMD_FLOAT simd_load_h_duplicate(const double* m) {
+static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
+    __m512d t0, t2;
+    __m256d t3, t4;
+
+    t0 = _mm512_add_pd(v0, _mm512_permute_pd(v0, 0x55));
+    t2 = _mm512_add_pd(v2, _mm512_permute_pd(v2, 0x55));
+    t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xaa), v1, _mm512_permute_pd(v1, 0x55));
+    t2 = _mm512_mask_add_pd(t2, simd_mask_from_u32(0xaa), v3, _mm512_permute_pd(v3, 0x55));
+    t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0x4e));
+    t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xF0), t2, _mm512_shuffle_f64x2(t2, t2, 0x4e));
+    t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0xb1));
+    t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0x0C), t0, t0, 0xee);
+    t3 = _mm512_castpd512_pd256(t0);
+    t4 = _mm256_load_pd(m);
+    t4 = _mm256_add_pd(t4, t3);
+    _mm256_store_pd(m, t4);
+
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
+    return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_duplicate(const MD_FLOAT *m) {
     return _mm512_broadcast_f64x4(_mm256_load_pd(m));
 }

-static inline MD_SIMD_FLOAT simd_load_h_dual(const double* m) {
+static inline MD_SIMD_FLOAT simd_load_h_dual(const MD_FLOAT *m) {
     return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1);
 }

-static inline double simd_h_dual_reduce_sum(double* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
+static inline MD_FLOAT simd_h_dual_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
     __m512d t0;
     __m256d t2, t3;

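Reading the new simd_incr_reduced_sum hunk: the _mm512_permute_pd(v, 0x55) steps pair-sum adjacent lanes, the masked adds interleave the pair-sums of v1/v3 into the odd lanes of t0/t2, and the 128-bit-chunk folds (shuffle_f64x2 with 0x4e, 0xb1, 0xee) leave lanes 0-3 of t0 holding the four horizontal sums hsum(v0)..hsum(v3). Those four values are added into m through a 256-bit load/add/store, and two final permutex folds collapse them into the returned grand total. Below is a standalone sanity-check sketch, not part of the commit: it rebuilds the function with local typedefs, stands in a plain integer cast for the project's simd_mask_from_u32() helper (sufficient for an __mmask8), and assumes a GCC/Clang AVX-512 build, e.g. gcc -O2 -mavx512f check.c.

#include <immintrin.h>
#include <stdio.h>

typedef double MD_FLOAT;
typedef __m512d MD_SIMD_FLOAT;

/* Assumed stand-in for the project helper; __mmask8 is an integer type. */
static inline __mmask8 simd_mask_from_u32(unsigned int a) { return (__mmask8)a; }

static inline MD_FLOAT simd_incr_reduced_sum(MD_FLOAT *m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1, MD_SIMD_FLOAT v2, MD_SIMD_FLOAT v3) {
    __m512d t0, t2;
    __m256d t3, t4;

    /* Pair-sum adjacent lanes; the masked adds interleave the v1/v3
     * pair-sums into the odd lanes of t0/t2. */
    t0 = _mm512_add_pd(v0, _mm512_permute_pd(v0, 0x55));
    t2 = _mm512_add_pd(v2, _mm512_permute_pd(v2, 0x55));
    t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xaa), v1, _mm512_permute_pd(v1, 0x55));
    t2 = _mm512_mask_add_pd(t2, simd_mask_from_u32(0xaa), v3, _mm512_permute_pd(v3, 0x55));
    /* Fold across 128-bit chunks until lanes 0-3 hold hsum(v0)..hsum(v3). */
    t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0x4e));
    t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xF0), t2, _mm512_shuffle_f64x2(t2, t2, 0x4e));
    t0 = _mm512_add_pd(t0, _mm512_shuffle_f64x2(t0, t0, 0xb1));
    t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0x0C), t0, t0, 0xee);
    /* m[i] += hsum(v_i) for i = 0..3; m must be 32-byte aligned. */
    t3 = _mm512_castpd512_pd256(t0);
    t4 = _mm256_load_pd(m);
    t4 = _mm256_add_pd(t4, t3);
    _mm256_store_pd(m, t4);
    /* Collapse the four per-atom sums into the returned grand total. */
    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
    return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
}

int main(void) {
    double v[4][8];
    double m[4] __attribute__((aligned(32))) = { 1.0, 2.0, 3.0, 4.0 };
    double m_ref[4] = { 1.0, 2.0, 3.0, 4.0 }, total_ref = 0.0;

    /* Scalar reference: m_ref[i] += hsum(v[i]). */
    for (int i = 0; i < 4; i++) {
        double s = 0.0;
        for (int l = 0; l < 8; l++) { v[i][l] = (double)(i * 8 + l + 1); s += v[i][l]; }
        m_ref[i] += s;
        total_ref += s;
    }
    double total = simd_incr_reduced_sum(m, _mm512_loadu_pd(v[0]), _mm512_loadu_pd(v[1]),
                                            _mm512_loadu_pd(v[2]), _mm512_loadu_pd(v[3]));
    for (int i = 0; i < 4; i++)
        printf("m[%d] = %g (expected %g)\n", i, m[i], m_ref[i]);
    printf("total = %g (expected %g)\n", total, total_ref);
    return 0;
}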