Commit version that works for M=N
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
parent
2b441e691e
commit
22d0f0b958
@ -37,6 +37,7 @@ void initAtom(Atom *atom) {
|
||||
atom->cl_x = NULL;
|
||||
atom->cl_v = NULL;
|
||||
atom->cl_f = NULL;
|
||||
atom->cl_type = NULL;
|
||||
atom->Natoms = 0;
|
||||
atom->Nlocal = 0;
|
||||
atom->Nghost = 0;
|
||||
@ -53,6 +54,7 @@ void initAtom(Atom *atom) {
|
||||
atom->cutneighsq = NULL;
|
||||
atom->iclusters = NULL;
|
||||
atom->jclusters = NULL;
|
||||
atom->icluster_bin = NULL;
|
||||
}
|
||||
|
||||
void createAtom(Atom *atom, Parameter *param) {
|
||||
@ -431,7 +433,9 @@ void growClusters(Atom *atom) {
|
||||
atom->Nclusters_max += DELTA;
|
||||
atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster));
|
||||
atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jfac * sizeof(Cluster), nold * jfac * sizeof(Cluster));
|
||||
atom->icluster_bin = (int*) reallocate(atom->icluster_bin, ALIGNMENT, atom->Nclusters_max * sizeof(int), nold * sizeof(int));
|
||||
atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
|
||||
}
|
||||
|
@ -293,13 +293,15 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon);
|
||||
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
|
||||
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
|
||||
const int unroll_j = VECTOR_WIDTH / CLUSTER_N;
|
||||
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp parallel for
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_cj0 = CJ0_FROM_CI(ci);
|
||||
#if CLUSTER_M > CLUSTER_N
|
||||
int ci_cj1 = CJ1_FROM_CI(ci);
|
||||
#endif
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
@ -331,11 +333,9 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
MD_SIMD_FLOAT fiy3 = simd_zero();
|
||||
MD_SIMD_FLOAT fiz3 = simd_zero();
|
||||
|
||||
for(int k = 0; k < numneighs; k += unroll_j) {
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = neighs[k + 0];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
unsigned int cond0 = (unsigned int)(ci == cj);
|
||||
unsigned int cond1 = cond0;
|
||||
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
|
||||
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
|
||||
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
|
||||
@ -354,16 +354,26 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
|
||||
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
|
||||
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
|
||||
|
||||
#if VECTOR_WIDTH == 8
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
int cond0 = (unsigned int)(cj == (ci << 1) + 0);
|
||||
int cond1 = (unsigned int)(cj == (ci << 1) + 1);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
|
||||
#else
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
|
||||
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
|
||||
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
|
||||
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
|
||||
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0));
|
||||
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1));
|
||||
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1));
|
||||
#endif
|
||||
|
||||
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
|
||||
|
@ -33,12 +33,23 @@
|
||||
// Simd4xN: M=4, N=VECTOR_WIDTH
|
||||
// Simd2xNN: M=4, N=(VECTOR_WIDTH/2)
|
||||
|
||||
// Simd2XNN (here used for single-precision)
|
||||
// Simd2xNN (here used for single-precision)
|
||||
#if VECTOR_WIDTH > CLUSTER_M * 2
|
||||
# define KERNEL_NAME "Simd2xNN"
|
||||
# define CLUSTER_N (VECTOR_WIDTH / 2)
|
||||
# define computeForceLJ computeForceLJ_2xnn
|
||||
// Simd4xN
|
||||
#else
|
||||
# define KERNEL_NAME "Simd4xN"
|
||||
# define CLUSTER_N VECTOR_WIDTH
|
||||
# define computeForceLJ computeForceLJ_4xn
|
||||
#endif
|
||||
|
||||
#ifdef USE_REFERENCE_VERSION
|
||||
# undef KERNEL_NAME
|
||||
# undef computeForceLJ
|
||||
# define KERNEL_NAME "Reference"
|
||||
# define computeForceLJ computeForceLJ_ref
|
||||
#endif
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
@ -105,6 +116,7 @@ typedef struct {
|
||||
MD_FLOAT *cl_f;
|
||||
int *cl_type;
|
||||
Cluster *iclusters, *jclusters;
|
||||
int *icluster_bin;
|
||||
} Atom;
|
||||
|
||||
extern void initAtom(Atom*);
|
||||
|
@ -43,14 +43,9 @@
|
||||
|
||||
extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
|
||||
|
||||
#ifdef USE_REFERENCE_VERSION
|
||||
# define computeForceLJ computeForceLJ_ref
|
||||
#else
|
||||
# define computeForceLJ computeForceLJ_4xn
|
||||
#endif
|
||||
|
||||
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
||||
double S, E;
|
||||
@ -325,6 +320,7 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
|
||||
printf(HLINE);
|
||||
printf("Kernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
|
||||
printf("Data layout for positions: %s\n", POS_DATA_LAYOUT);
|
||||
#if PRECISION == 1
|
||||
printf("Using single precision floating point.\n");
|
||||
|
@ -41,7 +41,6 @@ static int *bincount;
|
||||
static int *bins;
|
||||
static int *bin_nclusters;
|
||||
static int *bin_clusters;
|
||||
static int *icluster_bin;
|
||||
static int mbins; //total number of bins
|
||||
static int atoms_per_bin; // max atoms per bin
|
||||
static int clusters_per_bin; // max clusters per bin
|
||||
@ -64,7 +63,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
|
||||
cutneigh = param->cutneigh;
|
||||
nmax = 0;
|
||||
atoms_per_bin = 8;
|
||||
clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 4;
|
||||
clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 10;
|
||||
stencil = NULL;
|
||||
bins = NULL;
|
||||
bincount = NULL;
|
||||
@ -227,7 +226,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
|
||||
int n = 0;
|
||||
int ibin = icluster_bin[ci];
|
||||
int ibin = atom->icluster_bin[ci];
|
||||
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
|
||||
MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
|
||||
MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy;
|
||||
@ -590,7 +589,7 @@ void buildClusters(Atom *atom) {
|
||||
if(bbminz > ztmp) { bbminz = ztmp; }
|
||||
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
|
||||
|
||||
atom->cl_type[cii] = atom->type[i];
|
||||
ci_type[cii] = atom->type[i];
|
||||
atom->iclusters[ci].natoms++;
|
||||
} else {
|
||||
ci_x[CL_X_OFFSET + cii] = INFINITY;
|
||||
@ -601,7 +600,7 @@ void buildClusters(Atom *atom) {
|
||||
ac++;
|
||||
}
|
||||
|
||||
icluster_bin[ci] = bin;
|
||||
atom->icluster_bin[ci] = bin;
|
||||
atom->iclusters[ci].bbminx = bbminx;
|
||||
atom->iclusters[ci].bbmaxx = bbmaxx;
|
||||
atom->iclusters[ci].bbminy = bbminy;
|
||||
@ -711,7 +710,7 @@ void binClusters(Atom *atom) {
|
||||
MD_FLOAT *cptr = cluster_pos_ptr(ci);
|
||||
DEBUG_MESSAGE("Cluster %d:\n", ci);
|
||||
DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n",
|
||||
icluster_bin[ci],
|
||||
atom->icluster_bin[ci],
|
||||
atom->clusters[ci].natoms,
|
||||
atom->clusters[ci].bbminx,
|
||||
atom->clusters[ci].bbmaxx,
|
||||
@ -745,7 +744,7 @@ void binClusters(Atom *atom) {
|
||||
for(int ci = 0; ci < nlocal && !resize; ci++) {
|
||||
// Assure we add this j-cluster only once in the bin
|
||||
if(!(CLUSTER_M < CLUSTER_N && ci % 2)) {
|
||||
int bin = icluster_bin[ci];
|
||||
int bin = atom->icluster_bin[ci];
|
||||
int c = bin_nclusters[bin];
|
||||
if(c + 1 < clusters_per_bin) {
|
||||
bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci);
|
||||
@ -827,7 +826,7 @@ void binClusters(Atom *atom) {
|
||||
bin_clusters[bin * clusters_per_bin + c] = cj;
|
||||
}
|
||||
|
||||
icluster_bin[ci] = bin;
|
||||
atom->icluster_bin[ci] = bin;
|
||||
bin_nclusters[bin]++;
|
||||
} else {
|
||||
resize = 1;
|
||||
|
@ -46,6 +46,7 @@ void initPbc(Atom* atom) {
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
void updatePbc(Atom *atom, Parameter *param, int firstUpdate) {
|
||||
DEBUG_MESSAGE("updatePbc start\n");
|
||||
int *border_map = atom->border_map;
|
||||
int nlocal = atom->Nclusters_local;
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
@ -97,6 +98,8 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) {
|
||||
atom->iclusters[ci].bbmaxz = bbmaxz;
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("updatePbc end\n");
|
||||
}
|
||||
|
||||
/* relocate atoms that have left domain according
|
||||
@ -159,6 +162,7 @@ void growPbc(Atom* atom) {
|
||||
}
|
||||
|
||||
void setupPbc(Atom *atom, Parameter *param) {
|
||||
DEBUG_MESSAGE("setupPbc start\n");
|
||||
int *border_map = atom->border_map;
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
@ -236,4 +240,5 @@ void setupPbc(Atom *atom, Parameter *param) {
|
||||
|
||||
// Update created ghost clusters positions
|
||||
updatePbc(atom, param, 1);
|
||||
DEBUG_MESSAGE("setupPbc end\n");
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user