Add first kernel using SIMD instrinsics for 4xn cases
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
parent
5fd2d422ee
commit
6ad1e58a3e
7
Makefile
7
Makefile
@ -15,6 +15,9 @@ INCLUDES += -I./$(SRC_DIR)/includes
|
|||||||
ifeq ($(strip $(DATA_LAYOUT)),AOS)
|
ifeq ($(strip $(DATA_LAYOUT)),AOS)
|
||||||
DEFINES += -DAOS
|
DEFINES += -DAOS
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(strip $(CLUSTER_LAYOUT)),AOS)
|
||||||
|
DEFINES += -DCLUSTER_AOS
|
||||||
|
endif
|
||||||
ifeq ($(strip $(DATA_TYPE)),SP)
|
ifeq ($(strip $(DATA_TYPE)),SP)
|
||||||
DEFINES += -DPRECISION=1
|
DEFINES += -DPRECISION=1
|
||||||
else
|
else
|
||||||
@ -49,6 +52,10 @@ ifeq ($(strip $(COMPUTE_STATS)),true)
|
|||||||
DEFINES += -DCOMPUTE_STATS
|
DEFINES += -DCOMPUTE_STATS
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(strip $(USE_REFERENCE_VERSION)),true)
|
||||||
|
DEFINES += -DUSE_REFERENCE_VERSION
|
||||||
|
endif
|
||||||
|
|
||||||
ifneq ($(VECTOR_WIDTH),)
|
ifneq ($(VECTOR_WIDTH),)
|
||||||
DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH)
|
DEFINES += -DVECTOR_WIDTH=$(VECTOR_WIDTH)
|
||||||
endif
|
endif
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
# Compiler tag (GCC/CLANG/ICC)
|
# Compiler tag (GCC/CLANG/ICC)
|
||||||
TAG ?= ICC
|
TAG ?= ICC
|
||||||
# Optimization scheme (lammps/gromacs/clusters_per_bin)
|
# Optimization scheme (lammps/gromacs/clusters_per_bin)
|
||||||
OPT_SCHEME = lammps
|
OPT_SCHEME = gromacs
|
||||||
# Enable likwid (true or false)
|
# Enable likwid (true or false)
|
||||||
ENABLE_LIKWID ?= false
|
ENABLE_LIKWID ?= false
|
||||||
# SP or DP
|
# SP or DP
|
||||||
@ -26,6 +26,12 @@ VECTOR_WIDTH ?= 8
|
|||||||
# Compute statistics
|
# Compute statistics
|
||||||
COMPUTE_STATS ?= true
|
COMPUTE_STATS ?= true
|
||||||
|
|
||||||
|
# Configurations for gromacs optimization scheme
|
||||||
|
# AOS or SOA
|
||||||
|
CLUSTER_LAYOUT ?= SOA
|
||||||
|
# Use reference version
|
||||||
|
USE_REFERENCE_VERSION ?= true
|
||||||
|
|
||||||
#Feature options
|
#Feature options
|
||||||
OPTIONS = -DALIGNMENT=64
|
OPTIONS = -DALIGNMENT=64
|
||||||
#OPTIONS += More options
|
#OPTIONS += More options
|
||||||
|
@ -284,7 +284,7 @@ void growClusters(Atom *atom)
|
|||||||
int nold = atom->Nclusters_max;
|
int nold = atom->Nclusters_max;
|
||||||
atom->Nclusters_max += DELTA;
|
atom->Nclusters_max += DELTA;
|
||||||
atom->clusters = (Cluster*) reallocate(atom->clusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster));
|
atom->clusters = (Cluster*) reallocate(atom->clusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster));
|
||||||
atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_N * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_N * 3 * sizeof(MD_FLOAT));
|
atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT));
|
||||||
atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_N * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_N * 3 * sizeof(MD_FLOAT));
|
atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT));
|
||||||
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_N * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_N * 3 * sizeof(MD_FLOAT));
|
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT));
|
||||||
}
|
}
|
||||||
|
@ -29,9 +29,10 @@
|
|||||||
#include <atom.h>
|
#include <atom.h>
|
||||||
#include <stats.h>
|
#include <stats.h>
|
||||||
#include <util.h>
|
#include <util.h>
|
||||||
|
#include <simd.h>
|
||||||
|
|
||||||
|
|
||||||
double computeForceLJ(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
DEBUG_MESSAGE("computeForceLJ begin\n");
|
DEBUG_MESSAGE("computeForceLJ begin\n");
|
||||||
int Nlocal = atom->Nlocal;
|
int Nlocal = atom->Nlocal;
|
||||||
int* neighs;
|
int* neighs;
|
||||||
@ -69,7 +70,7 @@ double computeForceLJ(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *s
|
|||||||
MD_FLOAT fiy = 0;
|
MD_FLOAT fiy = 0;
|
||||||
MD_FLOAT fiz = 0;
|
MD_FLOAT fiz = 0;
|
||||||
|
|
||||||
for(int cjj = 0; cjj < CLUSTER_DIM_N; cjj++) {
|
for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) {
|
||||||
if(ci != cj || cii != cjj) {
|
if(ci != cj || cii != cjj) {
|
||||||
MD_FLOAT delx = xtmp - cluster_x(cjptr, cjj);
|
MD_FLOAT delx = xtmp - cluster_x(cjptr, cjj);
|
||||||
MD_FLOAT dely = ytmp - cluster_y(cjptr, cjj);
|
MD_FLOAT dely = ytmp - cluster_y(cjptr, cjj);
|
||||||
@ -101,3 +102,153 @@ double computeForceLJ(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *s
|
|||||||
DEBUG_MESSAGE("computeForceLJ end\n");
|
DEBUG_MESSAGE("computeForceLJ end\n");
|
||||||
return E-S;
|
return E-S;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||||
|
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
|
||||||
|
int Nlocal = atom->Nlocal;
|
||||||
|
int* neighs;
|
||||||
|
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||||
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
|
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
|
||||||
|
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
|
||||||
|
MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon);
|
||||||
|
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||||
|
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||||
|
const int unroll_j = CLUSTER_DIM_N / CLUSTER_DIM_M;
|
||||||
|
|
||||||
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
|
MD_FLOAT *fptr = cluster_force_ptr(ci);
|
||||||
|
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
|
||||||
|
cluster_x(fptr, cii) = 0.0;
|
||||||
|
cluster_y(fptr, cii) = 0.0;
|
||||||
|
cluster_z(fptr, cii) = 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double S = getTimeStamp();
|
||||||
|
LIKWID_MARKER_START("force");
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||||
|
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
|
||||||
|
MD_FLOAT *cifptr = cluster_force_ptr(ci);
|
||||||
|
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
|
||||||
|
int numneighs = neighbor->numneigh[ci];
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(cluster_x(ciptr, 0));
|
||||||
|
MD_SIMD_FLOAT yi0_tmp = simd_broadcast(cluster_y(ciptr, 0));
|
||||||
|
MD_SIMD_FLOAT zi0_tmp = simd_broadcast(cluster_z(ciptr, 0));
|
||||||
|
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(cluster_x(ciptr, 1));
|
||||||
|
MD_SIMD_FLOAT yi1_tmp = simd_broadcast(cluster_y(ciptr, 1));
|
||||||
|
MD_SIMD_FLOAT zi1_tmp = simd_broadcast(cluster_z(ciptr, 1));
|
||||||
|
MD_SIMD_FLOAT xi2_tmp = simd_broadcast(cluster_x(ciptr, 2));
|
||||||
|
MD_SIMD_FLOAT yi2_tmp = simd_broadcast(cluster_y(ciptr, 2));
|
||||||
|
MD_SIMD_FLOAT zi2_tmp = simd_broadcast(cluster_z(ciptr, 2));
|
||||||
|
MD_SIMD_FLOAT xi3_tmp = simd_broadcast(cluster_x(ciptr, 3));
|
||||||
|
MD_SIMD_FLOAT yi3_tmp = simd_broadcast(cluster_y(ciptr, 3));
|
||||||
|
MD_SIMD_FLOAT zi3_tmp = simd_broadcast(cluster_z(ciptr, 3));
|
||||||
|
MD_SIMD_FLOAT fix0 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiy0 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiz0 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fix1 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiy1 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiz1 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fix2 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiy2 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiz2 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fix3 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiy3 = simd_zero();
|
||||||
|
MD_SIMD_FLOAT fiz3 = simd_zero();
|
||||||
|
|
||||||
|
for(int k = 0; k < numneighs; k += unroll_j) {
|
||||||
|
int cj0 = neighs[k + 0];
|
||||||
|
int cj1 = neighs[k + 1];
|
||||||
|
unsigned int cond0 = (unsigned int)(ci == cj0);
|
||||||
|
unsigned int cond1 = (unsigned int)(ci == cj1);
|
||||||
|
MD_FLOAT *cj0_ptr = cluster_pos_ptr(cj0);
|
||||||
|
MD_FLOAT *cj1_ptr = cluster_pos_ptr(cj1);
|
||||||
|
MD_SIMD_FLOAT xj_tmp = simd_gather2(cj0_ptr, cj1_ptr, 0);
|
||||||
|
MD_SIMD_FLOAT yj_tmp = simd_gather2(cj0_ptr, cj1_ptr, 1);
|
||||||
|
MD_SIMD_FLOAT zj_tmp = simd_gather2(cj0_ptr, cj1_ptr, 2);
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
|
||||||
|
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
|
||||||
|
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
|
||||||
|
MD_SIMD_FLOAT delx1 = simd_sub(xi1_tmp, xj_tmp);
|
||||||
|
MD_SIMD_FLOAT dely1 = simd_sub(yi1_tmp, yj_tmp);
|
||||||
|
MD_SIMD_FLOAT delz1 = simd_sub(zi1_tmp, zj_tmp);
|
||||||
|
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
|
||||||
|
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
|
||||||
|
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
|
||||||
|
MD_SIMD_FLOAT delx3 = simd_sub(xi3_tmp, xj_tmp);
|
||||||
|
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
|
||||||
|
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
|
||||||
|
|
||||||
|
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x80 * cond0 - 0x8 * cond1));
|
||||||
|
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x40 * cond0 - 0x4 * cond1));
|
||||||
|
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x20 * cond0 - 0x2 * cond1));
|
||||||
|
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x10 * cond0 - 0x1 * cond1));
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||||
|
MD_SIMD_FLOAT rsq1 = simd_fma(delx1, delx1, simd_fma(dely1, dely1, simd_mul(delz1, delz1)));
|
||||||
|
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
|
||||||
|
MD_SIMD_FLOAT rsq3 = simd_fma(delx3, delx3, simd_fma(dely3, dely3, simd_mul(delz3, delz3)));
|
||||||
|
|
||||||
|
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
|
||||||
|
MD_SIMD_MASK cutoff_mask1 = simd_mask_and(excl_mask1, simd_mask_cond_lt(rsq1, cutforcesq_vec));
|
||||||
|
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
|
||||||
|
MD_SIMD_MASK cutoff_mask3 = simd_mask_and(excl_mask3, simd_mask_cond_lt(rsq3, cutforcesq_vec));
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
|
||||||
|
MD_SIMD_FLOAT sr2_1 = simd_reciprocal(rsq1);
|
||||||
|
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
|
||||||
|
MD_SIMD_FLOAT sr2_3 = simd_reciprocal(rsq3);
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
|
||||||
|
MD_SIMD_FLOAT sr6_1 = simd_mul(sr2_1, simd_mul(sr2_1, simd_mul(sr2_1, sigma6_vec)));
|
||||||
|
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
|
||||||
|
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
|
||||||
|
|
||||||
|
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec))));
|
||||||
|
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec))));
|
||||||
|
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec))));
|
||||||
|
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec))));
|
||||||
|
|
||||||
|
simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
|
||||||
|
simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
|
||||||
|
simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
|
||||||
|
simd_masked_add(fix1, simd_mul(delx1, force1), cutoff_mask1);
|
||||||
|
simd_masked_add(fiy1, simd_mul(dely1, force1), cutoff_mask1);
|
||||||
|
simd_masked_add(fiz1, simd_mul(delz1, force1), cutoff_mask1);
|
||||||
|
simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
|
||||||
|
simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
|
||||||
|
simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
|
||||||
|
simd_masked_add(fix3, simd_mul(delx3, force3), cutoff_mask3);
|
||||||
|
simd_masked_add(fiy3, simd_mul(dely3, force3), cutoff_mask3);
|
||||||
|
simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
|
||||||
|
}
|
||||||
|
|
||||||
|
cluster_x(cifptr, 0) = simd_horizontal_sum(fix0);
|
||||||
|
cluster_y(cifptr, 0) = simd_horizontal_sum(fiy0);
|
||||||
|
cluster_z(cifptr, 0) = simd_horizontal_sum(fiz0);
|
||||||
|
cluster_x(cifptr, 1) = simd_horizontal_sum(fix1);
|
||||||
|
cluster_y(cifptr, 1) = simd_horizontal_sum(fiy1);
|
||||||
|
cluster_z(cifptr, 1) = simd_horizontal_sum(fiz1);
|
||||||
|
cluster_x(cifptr, 2) = simd_horizontal_sum(fix2);
|
||||||
|
cluster_y(cifptr, 2) = simd_horizontal_sum(fiy2);
|
||||||
|
cluster_z(cifptr, 2) = simd_horizontal_sum(fiz2);
|
||||||
|
cluster_x(cifptr, 3) = simd_horizontal_sum(fix3);
|
||||||
|
cluster_y(cifptr, 3) = simd_horizontal_sum(fiy3);
|
||||||
|
cluster_z(cifptr, 3) = simd_horizontal_sum(fiz3);
|
||||||
|
|
||||||
|
addStat(stats->total_force_neighs, numneighs);
|
||||||
|
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
|
||||||
|
}
|
||||||
|
|
||||||
|
LIKWID_MARKER_STOP("force");
|
||||||
|
double E = getTimeStamp();
|
||||||
|
DEBUG_MESSAGE("computeForceLJ_4xn end\n");
|
||||||
|
return E-S;
|
||||||
|
}
|
||||||
|
@ -26,12 +26,12 @@
|
|||||||
#define __ATOM_H_
|
#define __ATOM_H_
|
||||||
|
|
||||||
#define CLUSTER_DIM_M 4
|
#define CLUSTER_DIM_M 4
|
||||||
#define CLUSTER_DIM_N 4
|
#define CLUSTER_DIM_N 8
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int bin;
|
int bin;
|
||||||
int natoms;
|
int natoms;
|
||||||
int type[CLUSTER_DIM_N];
|
int type[CLUSTER_DIM_M];
|
||||||
MD_FLOAT bbminx, bbmaxx;
|
MD_FLOAT bbminx, bbmaxx;
|
||||||
MD_FLOAT bbminy, bbmaxy;
|
MD_FLOAT bbminy, bbmaxy;
|
||||||
MD_FLOAT bbminz, bbmaxz;
|
MD_FLOAT bbminz, bbmaxz;
|
||||||
@ -62,26 +62,30 @@ extern int readAtom(Atom*, Parameter*);
|
|||||||
extern void growAtom(Atom*);
|
extern void growAtom(Atom*);
|
||||||
extern void growClusters(Atom*);
|
extern void growClusters(Atom*);
|
||||||
|
|
||||||
#define cluster_pos_ptr(ci) &(atom->cl_x[(ci) * CLUSTER_DIM_N * 3])
|
#define cluster_pos_ptr(ci) &(atom->cl_x[(ci) * CLUSTER_DIM_M * 3])
|
||||||
#define cluster_velocity_ptr(ci) &(atom->cl_v[(ci) * CLUSTER_DIM_N * 3])
|
#define cluster_velocity_ptr(ci) &(atom->cl_v[(ci) * CLUSTER_DIM_M * 3])
|
||||||
#define cluster_force_ptr(ci) &(atom->cl_f[(ci) * CLUSTER_DIM_N * 3])
|
#define cluster_force_ptr(ci) &(atom->cl_f[(ci) * CLUSTER_DIM_M * 3])
|
||||||
|
|
||||||
#ifdef AOS
|
#ifdef AOS
|
||||||
#define POS_DATA_LAYOUT "AoS"
|
#define POS_DATA_LAYOUT "AoS"
|
||||||
#define atom_x(i) atom->x[(i) * 3 + 0]
|
#define atom_x(i) atom->x[(i) * 3 + 0]
|
||||||
#define atom_y(i) atom->x[(i) * 3 + 1]
|
#define atom_y(i) atom->x[(i) * 3 + 1]
|
||||||
#define atom_z(i) atom->x[(i) * 3 + 2]
|
#define atom_z(i) atom->x[(i) * 3 + 2]
|
||||||
#define cluster_x(cptr, i) cptr[(i) * 3 + 0]
|
|
||||||
#define cluster_y(cptr, i) cptr[(i) * 3 + 1]
|
|
||||||
#define cluster_z(cptr, i) cptr[(i) * 3 + 2]
|
|
||||||
#else
|
#else
|
||||||
#define POS_DATA_LAYOUT "SoA"
|
#define POS_DATA_LAYOUT "SoA"
|
||||||
#define atom_x(i) atom->x[i]
|
#define atom_x(i) atom->x[i]
|
||||||
#define atom_y(i) atom->y[i]
|
#define atom_y(i) atom->y[i]
|
||||||
#define atom_z(i) atom->z[i]
|
#define atom_z(i) atom->z[i]
|
||||||
#define cluster_x(cptr, i) cptr[0 * CLUSTER_DIM_N + (i)]
|
#endif
|
||||||
#define cluster_y(cptr, i) cptr[1 * CLUSTER_DIM_N + (i)]
|
|
||||||
#define cluster_z(cptr, i) cptr[2 * CLUSTER_DIM_N + (i)]
|
#ifdef CLUSTER_AOS
|
||||||
|
#define cluster_x(cptr, i) cptr[(i) * 3 + 0]
|
||||||
|
#define cluster_y(cptr, i) cptr[(i) * 3 + 1]
|
||||||
|
#define cluster_z(cptr, i) cptr[(i) * 3 + 2]
|
||||||
|
#else
|
||||||
|
#define cluster_x(cptr, i) cptr[0 * CLUSTER_DIM_M + (i)]
|
||||||
|
#define cluster_y(cptr, i) cptr[1 * CLUSTER_DIM_M + (i)]
|
||||||
|
#define cluster_z(cptr, i) cptr[2 * CLUSTER_DIM_M + (i)]
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
61
gromacs/includes/simd.h
Normal file
61
gromacs/includes/simd.h
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
/*
|
||||||
|
* =======================================================================================
|
||||||
|
*
|
||||||
|
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||||
|
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
|
||||||
|
*
|
||||||
|
* This file is part of MD-Bench.
|
||||||
|
*
|
||||||
|
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU Lesser General Public License as published
|
||||||
|
* by the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||||
|
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||||
|
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||||
|
* details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License along
|
||||||
|
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
* =======================================================================================
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
#include <zmmintrin.h>
|
||||||
|
|
||||||
|
#define MD_SIMD_FLOAT __m512d
|
||||||
|
#define MD_SIMD_MASK __mmask8
|
||||||
|
|
||||||
|
static inline MD_SIMD_FLOAT simd_broadcast(double scalar) { return _mm512_set1_pd(scalar); }
|
||||||
|
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_pd(0.0); }
|
||||||
|
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_pd(a, b); }
|
||||||
|
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_pd(a, b); }
|
||||||
|
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_pd(a, b); }
|
||||||
|
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_pd(a, b, c); }
|
||||||
|
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_pd(a); }
|
||||||
|
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_pd(a, m, a, b); }
|
||||||
|
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int m) { return _cvtu32_mask8(m); }
|
||||||
|
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
|
||||||
|
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); }
|
||||||
|
|
||||||
|
static MD_SIMD_FLOAT simd_gather2(MD_FLOAT *c0, MD_FLOAT *c1, int d) {
|
||||||
|
MD_SIMD_FLOAT x;
|
||||||
|
#ifdef CLUSTER_AOS
|
||||||
|
__m256i aos_gather_vindex = _mm256_set_epi32(9, 6, 3, 0, 9, 6, 3, 0);
|
||||||
|
__m256i vindex = _mm256_add_epi32(aos_gather_vindex, _mm256_set1_epi32(d));
|
||||||
|
x = _mm512_mask_i32gather_pd(simd_zero(), simd_mask_from_u32(0x0f), vindex, c0, sizeof(double));
|
||||||
|
x = _mm512_mask_i32gather_pd(x, simd_mask_from_u32(0xf0), vindex, c1, sizeof(double));
|
||||||
|
#else
|
||||||
|
x = _mm512_loadu_pd(&c0[d * CLUSTER_DIM_M]);
|
||||||
|
x = _mm512_insertf64x4(x, _mm256_loadu_pd(&c1[d * CLUSTER_DIM_M]), 1);
|
||||||
|
#endif
|
||||||
|
return x;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
|
||||||
|
MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
|
||||||
|
x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11));
|
||||||
|
x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01));
|
||||||
|
return *((double *) &x);
|
||||||
|
}
|
@ -45,9 +45,16 @@
|
|||||||
|
|
||||||
#define HLINE "----------------------------------------------------------------------------\n"
|
#define HLINE "----------------------------------------------------------------------------\n"
|
||||||
|
|
||||||
extern double computeForceLJ(Parameter*, Atom*, Neighbor*, Stats*);
|
extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
|
||||||
|
extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
|
||||||
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
|
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
|
||||||
|
|
||||||
|
#ifdef USE_REFERENCE_VERSION
|
||||||
|
# define computeForceLJ computeForceLJ_ref
|
||||||
|
#else
|
||||||
|
# define computeForceLJ computeForceLJ_4xn
|
||||||
|
#endif
|
||||||
|
|
||||||
void init(Parameter *param) {
|
void init(Parameter *param) {
|
||||||
param->input_file = NULL;
|
param->input_file = NULL;
|
||||||
param->vtk_file = NULL;
|
param->vtk_file = NULL;
|
||||||
|
@ -63,7 +63,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
|
|||||||
cutneigh = param->cutneigh;
|
cutneigh = param->cutneigh;
|
||||||
nmax = 0;
|
nmax = 0;
|
||||||
atoms_per_bin = 8;
|
atoms_per_bin = 8;
|
||||||
clusters_per_bin = (atoms_per_bin / CLUSTER_DIM_N) + 4;
|
clusters_per_bin = (atoms_per_bin / CLUSTER_DIM_M) + 4;
|
||||||
stencil = NULL;
|
stencil = NULL;
|
||||||
bins = NULL;
|
bins = NULL;
|
||||||
bincount = NULL;
|
bincount = NULL;
|
||||||
@ -91,7 +91,8 @@ void setupNeighbor(Parameter *param, Atom *atom) {
|
|||||||
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;
|
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;
|
||||||
|
|
||||||
MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
|
MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
|
||||||
MD_FLOAT atoms_in_cell = MAX(CLUSTER_DIM_M, CLUSTER_DIM_N);
|
//MD_FLOAT atoms_in_cell = MAX(CLUSTER_DIM_M, CLUSTER_DIM_N);
|
||||||
|
MD_FLOAT atoms_in_cell = CLUSTER_DIM_M;
|
||||||
binsizex = cbrt(atoms_in_cell / atom_density);
|
binsizex = cbrt(atoms_in_cell / atom_density);
|
||||||
binsizey = cbrt(atoms_in_cell / atom_density);
|
binsizey = cbrt(atoms_in_cell / atom_density);
|
||||||
cutneighsq = cutneigh * cutneigh;
|
cutneighsq = cutneigh * cutneigh;
|
||||||
@ -269,6 +270,12 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(CLUSTER_DIM_N > CLUSTER_DIM_M) {
|
||||||
|
while(n % (CLUSTER_DIM_N / CLUSTER_DIM_M)) {
|
||||||
|
neighptr[n++] = nall - 1; // Last cluster is always a dummy cluster
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
neighbor->numneigh[ci] = n;
|
neighbor->numneigh[ci] = n;
|
||||||
if(n >= neighbor->maxneighs) {
|
if(n >= neighbor->maxneighs) {
|
||||||
resize = 1;
|
resize = 1;
|
||||||
@ -302,7 +309,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
atom->clusters[ci].bbminz,
|
atom->clusters[ci].bbminz,
|
||||||
atom->clusters[ci].bbmaxz);
|
atom->clusters[ci].bbmaxz);
|
||||||
|
|
||||||
for(int cii = 0; cii < CLUSTER_DIM_N; cii++) {
|
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
|
||||||
DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(ciptr, cii), cluster_y(ciptr, cii), cluster_z(ciptr, cii));
|
DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(ciptr, cii), cluster_y(ciptr, cii), cluster_z(ciptr, cii));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -320,7 +327,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
|||||||
atom->clusters[cj].bbminz,
|
atom->clusters[cj].bbminz,
|
||||||
atom->clusters[cj].bbmaxz);
|
atom->clusters[cj].bbmaxz);
|
||||||
|
|
||||||
for(int cjj = 0; cjj < CLUSTER_DIM_N; cjj++) {
|
for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) {
|
||||||
DEBUG_MESSAGE(" %f, %f, %f\n", cluster_x(cjptr, cjj), cluster_y(cjptr, cjj), cluster_z(cjptr, cjj));
|
DEBUG_MESSAGE(" %f, %f, %f\n", cluster_x(cjptr, cjj), cluster_y(cjptr, cjj), cluster_z(cjptr, cjj));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -481,7 +488,7 @@ void buildClusters(Atom *atom) {
|
|||||||
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
|
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
|
||||||
atom->clusters[ci].natoms = 0;
|
atom->clusters[ci].natoms = 0;
|
||||||
|
|
||||||
for(int cii = 0; cii < CLUSTER_DIM_N; cii++) {
|
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
|
||||||
if(ac < c) {
|
if(ac < c) {
|
||||||
int i = bins[bin * atoms_per_bin + ac];
|
int i = bins[bin * atoms_per_bin + ac];
|
||||||
MD_FLOAT xtmp = atom_x(i);
|
MD_FLOAT xtmp = atom_x(i);
|
||||||
@ -546,7 +553,7 @@ void binClusters(Atom *atom) {
|
|||||||
atom->clusters[ci].bbminz,
|
atom->clusters[ci].bbminz,
|
||||||
atom->clusters[ci].bbmaxz);
|
atom->clusters[ci].bbmaxz);
|
||||||
|
|
||||||
for(int cii = 0; cii < CLUSTER_DIM_N; cii++) {
|
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
|
||||||
DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
|
DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -210,6 +210,18 @@ void setupPbc(Atom *atom, Parameter *param) {
|
|||||||
if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
|
if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(atom->Nclusters_local + Nghost + 1 >= atom->Nclusters_max) {
|
||||||
|
growClusters(atom);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add dummy cluster at the end
|
||||||
|
MD_FLOAT *cptr = cluster_pos_ptr(atom->Nclusters_local + Nghost + 1);
|
||||||
|
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
|
||||||
|
cluster_x(cptr, cii) = INFINITY;
|
||||||
|
cluster_y(cptr, cii) = INFINITY;
|
||||||
|
cluster_z(cptr, cii) = INFINITY;
|
||||||
|
}
|
||||||
|
|
||||||
// increase by one to make it the ghost atom count
|
// increase by one to make it the ghost atom count
|
||||||
atom->Nghost = Nghost_atoms;
|
atom->Nghost = Nghost_atoms;
|
||||||
atom->Nclusters_ghost = Nghost + 1;
|
atom->Nclusters_ghost = Nghost + 1;
|
||||||
|
Loading…
Reference in New Issue
Block a user