Commit version that works for M=N

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
Rafael Ravedutti 2022-03-10 01:31:50 +01:00
parent 2b441e691e
commit 22d0f0b958
6 changed files with 51 additions and 25 deletions

View File

@ -37,6 +37,7 @@ void initAtom(Atom *atom) {
atom->cl_x = NULL; atom->cl_x = NULL;
atom->cl_v = NULL; atom->cl_v = NULL;
atom->cl_f = NULL; atom->cl_f = NULL;
atom->cl_type = NULL;
atom->Natoms = 0; atom->Natoms = 0;
atom->Nlocal = 0; atom->Nlocal = 0;
atom->Nghost = 0; atom->Nghost = 0;
@ -53,6 +54,7 @@ void initAtom(Atom *atom) {
atom->cutneighsq = NULL; atom->cutneighsq = NULL;
atom->iclusters = NULL; atom->iclusters = NULL;
atom->jclusters = NULL; atom->jclusters = NULL;
atom->icluster_bin = NULL;
} }
void createAtom(Atom *atom, Parameter *param) { void createAtom(Atom *atom, Parameter *param) {
@ -431,7 +433,9 @@ void growClusters(Atom *atom) {
atom->Nclusters_max += DELTA; atom->Nclusters_max += DELTA;
atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster)); atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster));
atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jfac * sizeof(Cluster), nold * jfac * sizeof(Cluster)); atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jfac * sizeof(Cluster), nold * jfac * sizeof(Cluster));
atom->icluster_bin = (int*) reallocate(atom->icluster_bin, ALIGNMENT, atom->Nclusters_max * sizeof(int), nold * sizeof(int));
atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
} }

View File

@ -293,13 +293,15 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon); MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0); MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5); MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
const int unroll_j = VECTOR_WIDTH / CLUSTER_N;
double S = getTimeStamp(); double S = getTimeStamp();
LIKWID_MARKER_START("force"); LIKWID_MARKER_START("force");
#pragma omp parallel for #pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
int ci_cj1 = CJ1_FROM_CI(ci);
#endif
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
@ -331,11 +333,9 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_SIMD_FLOAT fiy3 = simd_zero(); MD_SIMD_FLOAT fiy3 = simd_zero();
MD_SIMD_FLOAT fiz3 = simd_zero(); MD_SIMD_FLOAT fiz3 = simd_zero();
for(int k = 0; k < numneighs; k += unroll_j) { for(int k = 0; k < numneighs; k++) {
int cj = neighs[k + 0]; int cj = neighs[k + 0];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
unsigned int cond0 = (unsigned int)(ci == cj);
unsigned int cond1 = cond0;
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]); MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]); MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
@ -354,16 +354,26 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp); MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp); MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
#if VECTOR_WIDTH == 8 #if CLUSTER_M == CLUSTER_N
int cond0 = (unsigned int)(cj == ci_cj0);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
#elif CLUSTER_M < CLUSTER_N
int cond0 = (unsigned int)(cj == (ci << 1) + 0);
int cond1 = (unsigned int)(cj == (ci << 1) + 1);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1)); MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1)); MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1)); MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1)); MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
#else #else
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0)); unsigned int cond0 = (unsigned int)(cj == ci_cj0);
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0)); unsigned int cond1 = (unsigned int)(cj == ci_cj1);
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0)); MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0)); MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1));
#endif #endif
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0))); MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));

View File

@ -33,12 +33,23 @@
// Simd4xN: M=4, N=VECTOR_WIDTH // Simd4xN: M=4, N=VECTOR_WIDTH
// Simd2xNN: M=4, N=(VECTOR_WIDTH/2) // Simd2xNN: M=4, N=(VECTOR_WIDTH/2)
// Simd2XNN (here used for single-precision) // Simd2xNN (here used for single-precision)
#if VECTOR_WIDTH > CLUSTER_M * 2 #if VECTOR_WIDTH > CLUSTER_M * 2
# define KERNEL_NAME "Simd2xNN"
# define CLUSTER_N (VECTOR_WIDTH / 2) # define CLUSTER_N (VECTOR_WIDTH / 2)
# define computeForceLJ computeForceLJ_2xnn
// Simd4xN // Simd4xN
#else #else
# define KERNEL_NAME "Simd4xN"
# define CLUSTER_N VECTOR_WIDTH # define CLUSTER_N VECTOR_WIDTH
# define computeForceLJ computeForceLJ_4xn
#endif
#ifdef USE_REFERENCE_VERSION
# undef KERNEL_NAME
# undef computeForceLJ
# define KERNEL_NAME "Reference"
# define computeForceLJ computeForceLJ_ref
#endif #endif
#if CLUSTER_M == CLUSTER_N #if CLUSTER_M == CLUSTER_N
@ -105,6 +116,7 @@ typedef struct {
MD_FLOAT *cl_f; MD_FLOAT *cl_f;
int *cl_type; int *cl_type;
Cluster *iclusters, *jclusters; Cluster *iclusters, *jclusters;
int *icluster_bin;
} Atom; } Atom;
extern void initAtom(Atom*); extern void initAtom(Atom*);

View File

@ -43,14 +43,9 @@
extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
#ifdef USE_REFERENCE_VERSION
# define computeForceLJ computeForceLJ_ref
#else
# define computeForceLJ computeForceLJ_4xn
#endif
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) { double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
if(param->force_field == FF_EAM) { initEam(eam, param); } if(param->force_field == FF_EAM) { initEam(eam, param); }
double S, E; double S, E;
@ -325,6 +320,7 @@ int main(int argc, char** argv) {
} }
printf(HLINE); printf(HLINE);
printf("Kernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
printf("Data layout for positions: %s\n", POS_DATA_LAYOUT); printf("Data layout for positions: %s\n", POS_DATA_LAYOUT);
#if PRECISION == 1 #if PRECISION == 1
printf("Using single precision floating point.\n"); printf("Using single precision floating point.\n");

View File

@ -41,7 +41,6 @@ static int *bincount;
static int *bins; static int *bins;
static int *bin_nclusters; static int *bin_nclusters;
static int *bin_clusters; static int *bin_clusters;
static int *icluster_bin;
static int mbins; //total number of bins static int mbins; //total number of bins
static int atoms_per_bin; // max atoms per bin static int atoms_per_bin; // max atoms per bin
static int clusters_per_bin; // max clusters per bin static int clusters_per_bin; // max clusters per bin
@ -64,7 +63,7 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
cutneigh = param->cutneigh; cutneigh = param->cutneigh;
nmax = 0; nmax = 0;
atoms_per_bin = 8; atoms_per_bin = 8;
clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 4; clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 10;
stencil = NULL; stencil = NULL;
bins = NULL; bins = NULL;
bincount = NULL; bincount = NULL;
@ -227,7 +226,7 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
for(int ci = 0; ci < atom->Nclusters_local; ci++) { for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
int n = 0; int n = 0;
int ibin = icluster_bin[ci]; int ibin = atom->icluster_bin[ci];
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx; MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx; MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy; MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy;
@ -590,7 +589,7 @@ void buildClusters(Atom *atom) {
if(bbminz > ztmp) { bbminz = ztmp; } if(bbminz > ztmp) { bbminz = ztmp; }
if(bbmaxz < ztmp) { bbmaxz = ztmp; } if(bbmaxz < ztmp) { bbmaxz = ztmp; }
atom->cl_type[cii] = atom->type[i]; ci_type[cii] = atom->type[i];
atom->iclusters[ci].natoms++; atom->iclusters[ci].natoms++;
} else { } else {
ci_x[CL_X_OFFSET + cii] = INFINITY; ci_x[CL_X_OFFSET + cii] = INFINITY;
@ -601,7 +600,7 @@ void buildClusters(Atom *atom) {
ac++; ac++;
} }
icluster_bin[ci] = bin; atom->icluster_bin[ci] = bin;
atom->iclusters[ci].bbminx = bbminx; atom->iclusters[ci].bbminx = bbminx;
atom->iclusters[ci].bbmaxx = bbmaxx; atom->iclusters[ci].bbmaxx = bbmaxx;
atom->iclusters[ci].bbminy = bbminy; atom->iclusters[ci].bbminy = bbminy;
@ -711,7 +710,7 @@ void binClusters(Atom *atom) {
MD_FLOAT *cptr = cluster_pos_ptr(ci); MD_FLOAT *cptr = cluster_pos_ptr(ci);
DEBUG_MESSAGE("Cluster %d:\n", ci); DEBUG_MESSAGE("Cluster %d:\n", ci);
DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n", DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n",
icluster_bin[ci], atom->icluster_bin[ci],
atom->clusters[ci].natoms, atom->clusters[ci].natoms,
atom->clusters[ci].bbminx, atom->clusters[ci].bbminx,
atom->clusters[ci].bbmaxx, atom->clusters[ci].bbmaxx,
@ -745,7 +744,7 @@ void binClusters(Atom *atom) {
for(int ci = 0; ci < nlocal && !resize; ci++) { for(int ci = 0; ci < nlocal && !resize; ci++) {
// Assure we add this j-cluster only once in the bin // Assure we add this j-cluster only once in the bin
if(!(CLUSTER_M < CLUSTER_N && ci % 2)) { if(!(CLUSTER_M < CLUSTER_N && ci % 2)) {
int bin = icluster_bin[ci]; int bin = atom->icluster_bin[ci];
int c = bin_nclusters[bin]; int c = bin_nclusters[bin];
if(c + 1 < clusters_per_bin) { if(c + 1 < clusters_per_bin) {
bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci); bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci);
@ -827,7 +826,7 @@ void binClusters(Atom *atom) {
bin_clusters[bin * clusters_per_bin + c] = cj; bin_clusters[bin * clusters_per_bin + c] = cj;
} }
icluster_bin[ci] = bin; atom->icluster_bin[ci] = bin;
bin_nclusters[bin]++; bin_nclusters[bin]++;
} else { } else {
resize = 1; resize = 1;

View File

@ -46,6 +46,7 @@ void initPbc(Atom* atom) {
/* update coordinates of ghost atoms */ /* update coordinates of ghost atoms */
/* uses mapping created in setupPbc */ /* uses mapping created in setupPbc */
void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { void updatePbc(Atom *atom, Parameter *param, int firstUpdate) {
DEBUG_MESSAGE("updatePbc start\n");
int *border_map = atom->border_map; int *border_map = atom->border_map;
int nlocal = atom->Nclusters_local; int nlocal = atom->Nclusters_local;
MD_FLOAT xprd = param->xprd; MD_FLOAT xprd = param->xprd;
@ -97,6 +98,8 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) {
atom->iclusters[ci].bbmaxz = bbmaxz; atom->iclusters[ci].bbmaxz = bbmaxz;
} }
} }
DEBUG_MESSAGE("updatePbc end\n");
} }
/* relocate atoms that have left domain according /* relocate atoms that have left domain according
@ -159,6 +162,7 @@ void growPbc(Atom* atom) {
} }
void setupPbc(Atom *atom, Parameter *param) { void setupPbc(Atom *atom, Parameter *param) {
DEBUG_MESSAGE("setupPbc start\n");
int *border_map = atom->border_map; int *border_map = atom->border_map;
MD_FLOAT xprd = param->xprd; MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd; MD_FLOAT yprd = param->yprd;
@ -236,4 +240,5 @@ void setupPbc(Atom *atom, Parameter *param) {
// Update created ghost clusters positions // Update created ghost clusters positions
updatePbc(atom, param, 1); updatePbc(atom, param, 1);
DEBUG_MESSAGE("setupPbc end\n");
} }