Merge pull request #4 from RRZE-HPC/gromacs_sp

Gromacs sp: adds single-precision (SP) support to the GROMACS optimization scheme.
rafaelravedutti 2022-03-15 20:31:42 +01:00 committed by GitHub
commit 459853dc25
16 changed files with 909 additions and 490 deletions

View File

@ -10,15 +10,13 @@ Q ?= @
include $(MAKE_DIR)/config.mk
include $(MAKE_DIR)/include_$(TAG).mk
include $(MAKE_DIR)/include_LIKWID.mk
include $(MAKE_DIR)/include_ISA.mk
include $(MAKE_DIR)/include_GROMACS.mk
INCLUDES += -I./$(SRC_DIR)/includes
ifeq ($(strip $(DATA_LAYOUT)),AOS)
DEFINES += -DAOS
endif
ifeq ($(strip $(CLUSTER_LAYOUT)),AOS)
DEFINES += -DCLUSTER_AOS
endif
ifeq ($(strip $(DATA_TYPE)),SP)
DEFINES += -DPRECISION=1
else
@ -29,14 +27,6 @@ ifneq ($(ASM_SYNTAX), ATT)
ASFLAGS += -masm=intel
endif
ifneq ($(ATOMS_LOOP_RUNS),)
DEFINES += -DATOMS_LOOP_RUNS=$(ATOMS_LOOP_RUNS)
endif
ifneq ($(NEIGHBORS_LOOP_RUNS),)
DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS)
endif
ifeq ($(strip $(EXPLICIT_TYPES)),true)
DEFINES += -DEXPLICIT_TYPES
endif
@ -73,6 +63,10 @@ ifeq ($(strip $(NO_AVX2)),true)
DEFINES += -DNO_AVX2
endif
ifeq ($(strip $(AVX512)),true)
DEFINES += -DAVX512
endif
VPATH = $(SRC_DIR) $(ASM_DIR)
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
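For orientation, a minimal sketch of how the PRECISION define set above is presumably consumed by the sources; the typedef location shown here is an assumption, but the #if PRECISION == 1 check in the main file below confirms the convention:

/* Sketch (assumed location): PRECISION selects the scalar floating-point type. */
#if PRECISION == 1
typedef float MD_FLOAT;   /* DATA_TYPE=SP */
#else
typedef double MD_FLOAT;  /* DATA_TYPE=DP */
#endif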

View File

@ -1,11 +1,13 @@
# Compiler tag (GCC/CLANG/ICC)
TAG ?= ICC
# Instruction set (SSE/AVX/AVX2/AVX512)
ISA ?= AVX512
# Optimization scheme (lammps/gromacs/clusters_per_bin)
OPT_SCHEME ?= gromacs
# Enable likwid (true or false)
ENABLE_LIKWID ?= true
# SP or DP
DATA_TYPE ?= DP
DATA_TYPE ?= SP
# AOS or SOA
DATA_LAYOUT ?= AOS
# Assembly syntax to generate (ATT/INTEL)
@ -13,26 +15,16 @@ ASM_SYNTAX ?= ATT
# Debug
DEBUG ?= false
# Number of times to run the atoms loop on stubbed variant
ATOMS_LOOP_RUNS ?= 1
# Number of times to run the neighbors loop on stubbed variant
NEIGHBORS_LOOP_RUNS ?= 1
# Explicitly store and load atom types (true or false)
EXPLICIT_TYPES ?= false
# Trace memory addresses for cache simulator (true or false)
MEM_TRACER ?= false
# Trace indexes and distances for gather-md (true or false)
INDEX_TRACER ?= false
# Vector width (elements) for index and distance tracer
VECTOR_WIDTH ?= 8
# When vector width is 4 but AVX2 is not supported (AVX only), set this to true
NO_AVX2 ?= false
# Compute statistics
COMPUTE_STATS ?= true
# Configurations for gromacs optimization scheme
# AOS or SOA
CLUSTER_LAYOUT ?= SOA
# Use reference version
USE_REFERENCE_VERSION ?= false
# Enable XTC output

View File

@ -37,6 +37,7 @@ void initAtom(Atom *atom) {
atom->cl_x = NULL;
atom->cl_v = NULL;
atom->cl_f = NULL;
atom->cl_type = NULL;
atom->Natoms = 0;
atom->Nlocal = 0;
atom->Nghost = 0;
@ -51,7 +52,9 @@ void initAtom(Atom *atom) {
atom->sigma6 = NULL;
atom->cutforcesq = NULL;
atom->cutneighsq = NULL;
atom->clusters = NULL;
atom->iclusters = NULL;
atom->jclusters = NULL;
atom->icluster_bin = NULL;
}
void createAtom(Atom *atom, Parameter *param) {
@ -426,9 +429,13 @@ void growAtom(Atom *atom) {
void growClusters(Atom *atom) {
int nold = atom->Nclusters_max;
int jterm = MAX(1, CLUSTER_M / CLUSTER_N); // If M>N, we need to allocate more j-clusters
atom->Nclusters_max += DELTA;
atom->clusters = (Cluster*) reallocate(atom->clusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster));
atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT));
atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT));
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT));
atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster));
atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jterm * sizeof(Cluster), nold * jterm * sizeof(Cluster));
atom->icluster_bin = (int*) reallocate(atom->icluster_bin, ALIGNMENT, atom->Nclusters_max * sizeof(int), nold * sizeof(int));
atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
}
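growClusters() relies on a reallocate() helper; below is a minimal sketch of the contract it appears to assume (aligned allocation plus copy of the old contents). This implementation is hypothetical, not the repository's:

#include <stdlib.h>
#include <string.h>

/* Hypothetical sketch: grow an aligned buffer, preserving the first old_bytes.
   Assumes new_bytes is a multiple of alignment (C11 aligned_alloc requirement). */
static void *reallocate(void *old, size_t alignment, size_t new_bytes, size_t old_bytes) {
    void *ptr = aligned_alloc(alignment, new_bytes);
    if(old != NULL) {
        memcpy(ptr, old, old_bytes);
        free(old);
    }
    return ptr;
}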

View File

@ -41,11 +41,12 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_FLOAT epsilon = param->epsilon;
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *fptr = cluster_force_ptr(ci);
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
cluster_x(fptr, cii) = 0.0;
cluster_y(fptr, cii) = 0.0;
cluster_z(fptr, cii) = 0.0;
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_f[CL_X_OFFSET + cii] = 0.0;
ci_f[CL_Y_OFFSET + cii] = 0.0;
ci_f[CL_Z_OFFSET + cii] = 0.0;
}
}
@ -54,28 +55,39 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
#pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
MD_FLOAT *cifptr = cluster_force_ptr(ci);
int ci_cj0 = CJ0_FROM_CI(ci);
int ci_cj1 = CJ1_FROM_CI(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];
for(int k = 0; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int any = 0;
MD_FLOAT *cjptr = cluster_pos_ptr(cj);
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
MD_FLOAT xtmp = cluster_x(ciptr, cii);
MD_FLOAT ytmp = cluster_y(ciptr, cii);
MD_FLOAT ztmp = cluster_z(ciptr, cii);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
for(int cii = 0; cii < CLUSTER_M; cii++) {
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) {
if(ci != cj || cii != cjj) {
MD_FLOAT delx = xtmp - cluster_x(cjptr, cjj);
MD_FLOAT dely = ytmp - cluster_y(cjptr, cjj);
MD_FLOAT delz = ztmp - cluster_z(cjptr, cjj);
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
#if CLUSTER_M == CLUSTER_N
if(ci_cj0 != cj || cii != cjj) {
#elif CLUSTER_M < CLUSTER_N
if(ci_cj0 != cj || cii + CLUSTER_M * (ci & 0x1) != cjj) {
#else
if((ci_cj0 != cj || cii != cjj) && (ci_cj1 != cj || cii != cjj + CLUSTER_N)) {
#endif
MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj];
MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj];
MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj];
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq < cutforcesq) {
MD_FLOAT sr2 = 1.0 / rsq;
@ -98,15 +110,15 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
addStat(stats->clusters_outside_cutoff, 1);
}
cluster_x(cifptr, cii) += fix;
cluster_y(cifptr, cii) += fiy;
cluster_z(cifptr, cii) += fiz;
ci_f[CL_X_OFFSET + cii] += fix;
ci_f[CL_Y_OFFSET + cii] += fiy;
ci_f[CL_Z_OFFSET + cii] += fiz;
}
}
addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_DIM_M / CLUSTER_DIM_N));
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
}
LIKWID_MARKER_STOP("force");
@ -115,6 +127,132 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
return E-S;
}
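The reference kernel now reaches positions and forces through base-index macros plus per-dimension offsets, defined in the atom header further down. As a scalar illustration, assuming CLUSTER_M == CLUSTER_N:

/* Sketch: scalar access to one coordinate in the cluster layout. With
   CLUSTER_M == CLUSTER_N, CI_VECTOR_BASE_INDEX(ci) is ci * CLUSTER_N * 3, and
   the x/y/z components of atom cii sit at fixed offsets inside the cluster. */
static MD_FLOAT cluster_atom_x(Atom *atom, int ci, int cii) {
    MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];
    return ci_x[CL_X_OFFSET + cii]; /* CL_Y_OFFSET / CL_Z_OFFSET for y and z */
}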
double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
int Nlocal = atom->Nlocal;
int* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_f[CL_X_OFFSET + cii] = 0.0;
ci_f[CL_Y_OFFSET + cii] = 0.0;
ci_f[CL_Z_OFFSET + cii] = 0.0;
}
}
double S = getTimeStamp();
LIKWID_MARKER_START("force");
#pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
int ci_cj1 = CJ1_FROM_CI(ci);
#endif
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];
MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
MD_SIMD_FLOAT yi0_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 0]);
MD_SIMD_FLOAT yi2_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 2]);
MD_SIMD_FLOAT zi0_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 0]);
MD_SIMD_FLOAT zi2_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 2]);
MD_SIMD_FLOAT fix0 = simd_zero();
MD_SIMD_FLOAT fiy0 = simd_zero();
MD_SIMD_FLOAT fiz0 = simd_zero();
MD_SIMD_FLOAT fix2 = simd_zero();
MD_SIMD_FLOAT fiy2 = simd_zero();
MD_SIMD_FLOAT fiz2 = simd_zero();
for(int k = 0; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
unsigned int mask0, mask1, mask2, mask3;
MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
#if CLUSTER_M == CLUSTER_N
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
mask0 = (unsigned int)(0xf - 0x1 * cond0);
mask1 = (unsigned int)(0xf - 0x2 * cond0);
mask2 = (unsigned int)(0xf - 0x4 * cond0);
mask3 = (unsigned int)(0xf - 0x8 * cond0);
#elif CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
#else
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
mask1 = (unsigned int)(0x3 - 0x2 * cond0);
mask2 = (unsigned int)(0x3 - 0x1 * cond1);
mask3 = (unsigned int)(0x3 - 0x2 * cond1);
#endif
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
}
simd_h_dual_reduce_sum(&ci_f[CL_X_OFFSET], fix0, fix2);
simd_h_dual_reduce_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2);
simd_h_dual_reduce_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2);
addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
}
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
return E-S;
}
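The exclusion masks in this kernel clear the lane of a self-pair so that an atom never interacts with itself. A scalar sketch for the CLUSTER_M == CLUSTER_N case (mask0..mask3 above correspond to cii = 0..3):

/* Sketch: diagonal-exclusion mask for CLUSTER_M == CLUSTER_N == 4. When cj is
   the j-image of ci (cj == ci_cj0), bit cii is cleared from the all-ones lane
   mask 0xf, so j-atom cjj == cii is masked out for i-atom cii. */
static unsigned int excl_mask(int cii, int cj, int ci_cj0) {
    unsigned int cond0 = (unsigned int)(cj == ci_cj0);
    return 0xfu - ((1u << cii) * cond0);
}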
double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
@ -125,33 +263,36 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_FLOAT epsilon = param->epsilon;
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
const int unroll_j = CLUSTER_DIM_N / CLUSTER_DIM_M;
double S = getTimeStamp();
LIKWID_MARKER_START("force");
#pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
MD_FLOAT *cifptr = cluster_force_ptr(ci);
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
int ci_cj1 = CJ1_FROM_CI(ci);
#endif
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(cluster_x(ciptr, 0));
MD_SIMD_FLOAT yi0_tmp = simd_broadcast(cluster_y(ciptr, 0));
MD_SIMD_FLOAT zi0_tmp = simd_broadcast(cluster_z(ciptr, 0));
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(cluster_x(ciptr, 1));
MD_SIMD_FLOAT yi1_tmp = simd_broadcast(cluster_y(ciptr, 1));
MD_SIMD_FLOAT zi1_tmp = simd_broadcast(cluster_z(ciptr, 1));
MD_SIMD_FLOAT xi2_tmp = simd_broadcast(cluster_x(ciptr, 2));
MD_SIMD_FLOAT yi2_tmp = simd_broadcast(cluster_y(ciptr, 2));
MD_SIMD_FLOAT zi2_tmp = simd_broadcast(cluster_z(ciptr, 2));
MD_SIMD_FLOAT xi3_tmp = simd_broadcast(cluster_x(ciptr, 3));
MD_SIMD_FLOAT yi3_tmp = simd_broadcast(cluster_y(ciptr, 3));
MD_SIMD_FLOAT zi3_tmp = simd_broadcast(cluster_z(ciptr, 3));
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]);
MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]);
MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]);
MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]);
MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]);
MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]);
MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]);
MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]);
MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]);
MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]);
MD_SIMD_FLOAT fix0 = simd_zero();
MD_SIMD_FLOAT fiy0 = simd_zero();
MD_SIMD_FLOAT fiz0 = simd_zero();
@ -165,24 +306,13 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_SIMD_FLOAT fiy3 = simd_zero();
MD_SIMD_FLOAT fiz3 = simd_zero();
for(int k = 0; k < numneighs; k += unroll_j) {
int cj0 = neighs[k + 0];
unsigned int cond0 = (unsigned int)(ci == cj0);
MD_FLOAT *cj0_ptr = cluster_pos_ptr(cj0);
#if VECTOR_WIDTH == 8
int cj1 = neighs[k + 1];
unsigned int cond1 = (unsigned int)(ci == cj1);
MD_FLOAT *cj1_ptr = cluster_pos_ptr(cj1);
MD_SIMD_FLOAT xj_tmp = simd_load2(cj0_ptr, cj1_ptr, 0);
MD_SIMD_FLOAT yj_tmp = simd_load2(cj0_ptr, cj1_ptr, 1);
MD_SIMD_FLOAT zj_tmp = simd_load2(cj0_ptr, cj1_ptr, 2);
#else
MD_SIMD_FLOAT xj_tmp = simd_load(cj0_ptr, 0);
MD_SIMD_FLOAT yj_tmp = simd_load(cj0_ptr, 1);
MD_SIMD_FLOAT zj_tmp = simd_load(cj0_ptr, 2);
#endif
for(int k = 0; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
@ -196,16 +326,26 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);
#if VECTOR_WIDTH == 8
#if CLUSTER_M == CLUSTER_N
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
#elif CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
#else
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1));
#endif
MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
@ -228,10 +368,10 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec))));
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec))));
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec))));
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));
fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
@ -247,22 +387,22 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
}
cluster_x(cifptr, 0) = simd_horizontal_sum(fix0);
cluster_y(cifptr, 0) = simd_horizontal_sum(fiy0);
cluster_z(cifptr, 0) = simd_horizontal_sum(fiz0);
cluster_x(cifptr, 1) = simd_horizontal_sum(fix1);
cluster_y(cifptr, 1) = simd_horizontal_sum(fiy1);
cluster_z(cifptr, 1) = simd_horizontal_sum(fiz1);
cluster_x(cifptr, 2) = simd_horizontal_sum(fix2);
cluster_y(cifptr, 2) = simd_horizontal_sum(fiy2);
cluster_z(cifptr, 2) = simd_horizontal_sum(fiz2);
cluster_x(cifptr, 3) = simd_horizontal_sum(fix3);
cluster_y(cifptr, 3) = simd_horizontal_sum(fiy3);
cluster_z(cifptr, 3) = simd_horizontal_sum(fiz3);
ci_f[CL_X_OFFSET + 0] = simd_h_reduce_sum(fix0);
ci_f[CL_X_OFFSET + 1] = simd_h_reduce_sum(fix1);
ci_f[CL_X_OFFSET + 2] = simd_h_reduce_sum(fix2);
ci_f[CL_X_OFFSET + 3] = simd_h_reduce_sum(fix3);
ci_f[CL_Y_OFFSET + 0] = simd_h_reduce_sum(fiy0);
ci_f[CL_Y_OFFSET + 1] = simd_h_reduce_sum(fiy1);
ci_f[CL_Y_OFFSET + 2] = simd_h_reduce_sum(fiy2);
ci_f[CL_Y_OFFSET + 3] = simd_h_reduce_sum(fiy3);
ci_f[CL_Z_OFFSET + 0] = simd_h_reduce_sum(fiz0);
ci_f[CL_Z_OFFSET + 1] = simd_h_reduce_sum(fiz1);
ci_f[CL_Z_OFFSET + 2] = simd_h_reduce_sum(fiz2);
ci_f[CL_Z_OFFSET + 3] = simd_h_reduce_sum(fiz3);
addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_DIM_M / CLUSTER_DIM_N));
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
}
LIKWID_MARKER_STOP("force");

View File

@ -27,13 +27,71 @@
#define DELTA 20000
#define CLUSTER_DIM_M 4
#define CLUSTER_DIM_N VECTOR_WIDTH
#define CLUSTER_M 4
// Nbnxn layouts (as of GROMACS):
// Simd4xN: M=4, N=VECTOR_WIDTH
// Simd2xNN: M=4, N=(VECTOR_WIDTH/2)
// Simd2xNN (here used for single-precision)
#if VECTOR_WIDTH > CLUSTER_M * 2
# define KERNEL_NAME "Simd2xNN"
# define CLUSTER_N (VECTOR_WIDTH / 2)
# define computeForceLJ computeForceLJ_2xnn
// Simd4xN
#else
# define KERNEL_NAME "Simd4xN"
# define CLUSTER_N VECTOR_WIDTH
# define computeForceLJ computeForceLJ_4xn
#endif
#ifdef USE_REFERENCE_VERSION
# undef KERNEL_NAME
# undef computeForceLJ
# define KERNEL_NAME "Reference"
# define computeForceLJ computeForceLJ_ref
#endif
#if CLUSTER_M == CLUSTER_N
# define CJ0_FROM_CI(a) (a)
# define CJ1_FROM_CI(a) (a)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#elif CLUSTER_M == CLUSTER_N * 2 // M > N
# define CJ0_FROM_CI(a) ((a) << 1)
# define CJ1_FROM_CI(a) (((a) << 1) | 0x1)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b))
# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
#elif CLUSTER_M == CLUSTER_N / 2 // M < N
# define CJ0_FROM_CI(a) ((a) >> 1)
# define CJ1_FROM_CI(a) ((a) >> 1)
# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#else
# error "Invalid cluster configuration!"
#endif
#if CLUSTER_N != 2 && CLUSTER_N != 4 && CLUSTER_N != 8
# error "Cluster N dimension can be only 2, 4 and 8"
#endif
#define CI_SCALAR_BASE_INDEX(a) (CI_BASE_INDEX(a, 1))
#define CI_VECTOR_BASE_INDEX(a) (CI_BASE_INDEX(a, 3))
#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1))
#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3))
#if CLUSTER_M >= CLUSTER_N
# define CL_X_OFFSET (0 * CLUSTER_M)
# define CL_Y_OFFSET (1 * CLUSTER_M)
# define CL_Z_OFFSET (2 * CLUSTER_M)
#else
# define CL_X_OFFSET (0 * CLUSTER_N)
# define CL_Y_OFFSET (1 * CLUSTER_N)
# define CL_Z_OFFSET (2 * CLUSTER_N)
#endif
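To make the index arithmetic concrete, a worked example for the single-precision Simd2xNN case (CLUSTER_M = 4, VECTOR_WIDTH = 16, hence CLUSTER_N = 8, i.e. the M < N branch):

/* Worked example, M = 4, N = 8 (M < N):
   CI_VECTOR_BASE_INDEX(5) = (5 >> 1) * 8 * 3 + (5 & 1) * (8 >> 1) = 48 + 4 = 52
   CJ_VECTOR_BASE_INDEX(2) = 2 * 8 * 3 = 48
   CL_X_OFFSET = 0, CL_Y_OFFSET = 8, CL_Z_OFFSET = 16
   Two consecutive i-clusters share one j-cluster's 8-wide storage; the odd
   i-cluster starts at the half-cluster offset of 4 elements. */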
typedef struct {
int bin;
int natoms;
int type[CLUSTER_DIM_M];
MD_FLOAT bbminx, bbmaxx;
MD_FLOAT bbminy, bbmaxy;
MD_FLOAT bbminz, bbmaxz;
@ -44,9 +102,6 @@ typedef struct {
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
MD_FLOAT *x, *y, *z;
MD_FLOAT *vx, *vy, *vz;
MD_FLOAT *cl_x;
MD_FLOAT *cl_v;
MD_FLOAT *cl_f;
int *border_map;
int *type;
int ntypes;
@ -54,8 +109,15 @@ typedef struct {
MD_FLOAT *sigma6;
MD_FLOAT *cutforcesq;
MD_FLOAT *cutneighsq;
Cluster *clusters;
int *PBCx, *PBCy, *PBCz;
// Data in cluster format
MD_FLOAT *cl_x;
MD_FLOAT *cl_v;
MD_FLOAT *cl_f;
int *cl_type;
Cluster *iclusters, *jclusters;
int *icluster_bin;
int dummy_cj;
} Atom;
extern void initAtom(Atom*);
@ -67,10 +129,6 @@ extern int readAtom_dmp(Atom*, Parameter*);
extern void growAtom(Atom*);
extern void growClusters(Atom*);
#define cluster_pos_ptr(ci) &(atom->cl_x[(ci) * CLUSTER_DIM_M * 3])
#define cluster_velocity_ptr(ci) &(atom->cl_v[(ci) * CLUSTER_DIM_M * 3])
#define cluster_force_ptr(ci) &(atom->cl_f[(ci) * CLUSTER_DIM_M * 3])
#ifdef AOS
#define POS_DATA_LAYOUT "AoS"
#define atom_x(i) atom->x[(i) * 3 + 0]
@ -83,14 +141,4 @@ extern void growClusters(Atom*);
#define atom_z(i) atom->z[i]
#endif
#ifdef CLUSTER_AOS
#define cluster_x(cptr, i) cptr[(i) * 3 + 0]
#define cluster_y(cptr, i) cptr[(i) * 3 + 1]
#define cluster_z(cptr, i) cptr[(i) * 3 + 2]
#else
#define cluster_x(cptr, i) cptr[0 * CLUSTER_DIM_M + (i)]
#define cluster_y(cptr, i) cptr[1 * CLUSTER_DIM_M + (i)]
#define cluster_z(cptr, i) cptr[2 * CLUSTER_DIM_M + (i)]
#endif
#endif

View File

@ -40,6 +40,7 @@ extern void buildNeighbor(Atom*, Neighbor*);
extern void pruneNeighbor(Parameter*, Atom*, Neighbor*);
extern void sortAtom(Atom*);
extern void buildClusters(Atom*);
extern void defineJClusters(Atom*);
extern void binClusters(Atom*);
extern void updateSingleAtoms(Atom*);
#endif

View File

@ -21,6 +21,7 @@
* =======================================================================================
*/
#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#include <zmmintrin.h>
@ -28,7 +29,9 @@
#define SIMD_PRINT_REAL(a) simd_print_real(#a, a);
#define SIMD_PRINT_MASK(a) simd_print_mask(#a, a);
#if VECTOR_WIDTH == 8 // AVX512
#ifdef AVX512
#if PRECISION == 2 // Double precision
#define MD_SIMD_FLOAT __m512d
#define MD_SIMD_MASK __mmask8
@ -45,30 +48,100 @@ static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { retur
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); }
static MD_SIMD_FLOAT simd_load2(MD_FLOAT *c0, MD_FLOAT *c1, int d) {
MD_SIMD_FLOAT x;
#ifdef CLUSTER_AOS
__m256i aos_gather_vindex = _mm256_set_epi32(9, 6, 3, 0, 9, 6, 3, 0);
__m256i vindex = _mm256_add_epi32(aos_gather_vindex, _mm256_set1_epi32(d));
x = _mm512_mask_i32gather_pd(simd_zero(), simd_mask_from_u32(0x0f), vindex, c0, sizeof(double));
x = _mm512_mask_i32gather_pd(x, simd_mask_from_u32(0xf0), vindex, c1, sizeof(double));
#else
x = _mm512_load_pd(&c0[d * CLUSTER_DIM_M]);
x = _mm512_insertf64x4(x, _mm256_load_pd(&c1[d * CLUSTER_DIM_M]), 1);
#endif
return x;
}
static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11));
x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01));
return *((MD_FLOAT *) &x);
}
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const double* m) {
return _mm512_broadcast_f64x4(_mm256_load_pd(m));
}
static inline MD_SIMD_FLOAT simd_load_h_dual(const double* m) {
return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1);
}
static inline double simd_h_dual_reduce_sum(double* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
__m512d t0;
__m256d t2, t3;
t0 = _mm512_add_pd(v0, _mm512_permutex_pd(v0, 0x4e));
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xccul), v1, _mm512_permutex_pd(v1, 0x4e));
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0xaaul), t0, t0, 0xee);
t2 = _mm512_castpd512_pd256(t0);
t3 = _mm256_load_pd(m);
t3 = _mm256_add_pd(t3, t2);
_mm256_store_pd(m, t3);
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
}
#else // Single-precision
#define MD_SIMD_FLOAT __m512
#define MD_SIMD_MASK __mmask16
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_ps(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_ps(a, b); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_ps(a, b, c); }
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_ps(a); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_ps(a, m, a, b); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask16(a, b); }
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); }
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
// This would only be called in an Mx16 configuration, which is not valid in GROMACS
fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision, which is not valid!\n");
exit(-1);
return 0.0;
}
static inline MD_SIMD_FLOAT simd_load_h_duplicate(const float* m) {
return _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_load_pd((const double *)(m))));
}
static inline MD_SIMD_FLOAT simd_load_h_dual(const float* m) {
return _mm512_shuffle_f32x4(_mm512_broadcastss_ps(_mm_load_ss(m)), _mm512_broadcastss_ps(_mm_load_ss(m + 1)), 0x44);
}
static inline MD_FLOAT simd_h_dual_reduce_sum(float* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
__m512 t0, t1;
__m128 t2, t3;
t0 = _mm512_shuffle_f32x4(v0, v1, 0x88);
t1 = _mm512_shuffle_f32x4(v0, v1, 0xdd);
t0 = _mm512_add_ps(t0, t1);
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0x4e));
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0xb1));
t0 = _mm512_maskz_compress_ps(simd_mask_from_u32(0x1111ul), t0);
t3 = _mm512_castps512_ps128(t0);
t2 = _mm_load_ps(m);
t2 = _mm_add_ps(t2, t3);
_mm_store_ps(m, t2);
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0x4e));
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1));
return _mm_cvtss_f32(t3);
}
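The intrinsic sequences above are dense; the following scalar reference states what simd_h_dual_reduce_sum() is understood to compute, for either precision. This is an inference from the data layout (each register half carries the partial forces of one i-atom); w stands for VECTOR_WIDTH and the helper name is ours:

/* Scalar sketch of simd_h_dual_reduce_sum(): v0 carries i-atoms 0 and 1 (one
   per register half), v1 carries i-atoms 2 and 3. Each half is summed,
   accumulated into m[0..3], and the grand total is returned. */
static MD_FLOAT h_dual_reduce_sum_ref(MD_FLOAT *m, const MD_FLOAT *v0,
                                      const MD_FLOAT *v1, int w) {
    MD_FLOAT total = 0.0;
    for(int half = 0; half < 2; half++) {
        MD_FLOAT s0 = 0.0, s1 = 0.0;
        for(int l = 0; l < w / 2; l++) {
            s0 += v0[half * (w / 2) + l];
            s1 += v1[half * (w / 2) + l];
        }
        m[0 + half] += s0;
        m[2 + half] += s1;
        total += s0 + s1;
    }
    return total;
}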
#endif // PRECISION
#else // AVX or AVX2
#if PRECISION == 2 // Double precision
#define MD_SIMD_FLOAT __m256d
#ifdef NO_AVX2
@ -82,8 +155,10 @@ static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }
#ifdef NO_AVX2
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return simd_add(simd_mul(a, b), c); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
@ -97,7 +172,7 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
}
// TODO: Implement this, although it is only required for debugging
static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; }
static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128d a0, a1;
a = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101));
a0 = _mm256_castpd256_pd128(a);
@ -105,7 +180,9 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
a0 = _mm_add_sd(a0, a1);
return *((MD_FLOAT *) &a0);
}
#else
#else // AVX2
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_pd(a, m, a, b); }
@ -113,7 +190,7 @@ static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) {
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128d a0, a1;
// test with shuffle & add as an alternative to hadd later
a = _mm256_hadd_pd(a, a);
@ -122,24 +199,52 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
a0 = _mm_add_sd(a0, a1);
return *((MD_FLOAT *) &a0);
}
#endif
static MD_SIMD_FLOAT simd_load(MD_FLOAT *c0, int d) {
MD_SIMD_FLOAT x;
#ifdef CLUSTER_AOS
#ifdef NO_AVX2
#error "Not possible to use AoS cluster layout without AVX2 support!"
#endif
__m128i aos_gather_vindex = _mm128_set_epi32(9, 6, 3, 0);
__m128i vindex = _mm128_add_epi32(aos_gather_vindex, _mm128_set1_epi32(d));
x = _mm256_i32gather_pd(c0, vindex, sizeof(double));
#else
x = _mm256_load_pd(&c0[d * CLUSTER_DIM_M]);
#endif
return x;
}
#else // Single-precision
#define MD_SIMD_FLOAT __m256
#ifdef NO_AVX2
#define MD_SIMD_MASK __m256
#else
#define MD_SIMD_MASK __mmask8
#endif
static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm256_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
#ifdef NO_AVX2
#error "AVX intrinsincs with single-precision not implemented!"
#else // AVX2
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128 t0;
t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
return *((MD_FLOAT *) &t0);
}
#endif
#endif // NO_AVX2
#endif // PRECISION
#endif // AVX or AVX2
static inline void simd_print_real(const char *ref, MD_SIMD_FLOAT a) {
double x[VECTOR_WIDTH];

View File

@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) {
if(!csv) {
printf("Number of timesteps: %d\n", param.ntimes);
printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
printf("Total number of atoms: %d\n", atom->Nlocal);
@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) {
E = getTimeStamp();
double T_accum = E-S;
double freq_hz = param.proc_freq * 1.e9;
const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
if(!csv) {

View File

@ -43,14 +43,9 @@
extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
#ifdef USE_REFERENCE_VERSION
# define computeForceLJ computeForceLJ_ref
#else
# define computeForceLJ computeForceLJ_4xn
#endif
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
if(param->force_field == FF_EAM) { initEam(eam, param); }
double S, E;
@ -75,6 +70,7 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
setupThermo(param, atom->Natoms);
if(param->input_file == NULL) { adjustThermo(param, atom); }
buildClusters(atom);
defineJClusters(atom);
setupPbc(atom, param);
binClusters(atom);
buildNeighbor(atom, neighbor);
@ -89,6 +85,7 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
updateSingleAtoms(atom);
updateAtomsPbc(atom, param);
buildClusters(atom);
defineJClusters(atom);
setupPbc(atom, param);
binClusters(atom);
buildNeighbor(atom, neighbor);
@ -101,17 +98,18 @@ void initialIntegrate(Parameter *param, Atom *atom) {
DEBUG_MESSAGE("initialIntegrate start\n");
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
MD_FLOAT *civptr = cluster_velocity_ptr(ci);
MD_FLOAT *cifptr = cluster_force_ptr(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
cluster_x(civptr, cii) += param->dtforce * cluster_x(cifptr, cii);
cluster_y(civptr, cii) += param->dtforce * cluster_y(cifptr, cii);
cluster_z(civptr, cii) += param->dtforce * cluster_z(cifptr, cii);
cluster_x(ciptr, cii) += param->dt * cluster_x(civptr, cii);
cluster_y(ciptr, cii) += param->dt * cluster_y(civptr, cii);
cluster_z(ciptr, cii) += param->dt * cluster_z(civptr, cii);
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
ci_x[CL_X_OFFSET + cii] += param->dt * ci_v[CL_X_OFFSET + cii];
ci_x[CL_Y_OFFSET + cii] += param->dt * ci_v[CL_Y_OFFSET + cii];
ci_x[CL_Z_OFFSET + cii] += param->dt * ci_v[CL_Z_OFFSET + cii];
}
}
@ -122,13 +120,14 @@ void finalIntegrate(Parameter *param, Atom *atom) {
DEBUG_MESSAGE("finalIntegrate start\n");
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *civptr = cluster_velocity_ptr(ci);
MD_FLOAT *cifptr = cluster_force_ptr(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
cluster_x(civptr, cii) += param->dtforce * cluster_x(cifptr, cii);
cluster_y(civptr, cii) += param->dtforce * cluster_y(cifptr, cii);
cluster_z(civptr, cii) += param->dtforce * cluster_z(cifptr, cii);
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
}
}
@ -316,6 +315,7 @@ int main(int argc, char** argv) {
}
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
updateSingleAtoms(&atom);
computeThermo(-1, &param, &atom);
if(param.xtc_file != NULL) {
@ -323,6 +323,7 @@ int main(int argc, char** argv) {
}
printf(HLINE);
printf("Kernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
printf("Data layout for positions: %s\n", POS_DATA_LAYOUT);
#if PRECISION == 1
printf("Using single precision floating point.\n");

View File

@ -39,8 +39,8 @@ static int nbinx, nbiny;
static int mbinx, mbiny; // n bins in x, y
static int *bincount;
static int *bins;
static int *cluster_bincount;
static int *cluster_bins;
static int *bin_nclusters;
static int *bin_clusters;
static int mbins; //total number of bins
static int atoms_per_bin; // max atoms per bin
static int clusters_per_bin; // max clusters per bin
@ -63,12 +63,12 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
cutneigh = param->cutneigh;
nmax = 0;
atoms_per_bin = 8;
clusters_per_bin = (atoms_per_bin / CLUSTER_DIM_M) + 4;
clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 10;
stencil = NULL;
bins = NULL;
bincount = NULL;
cluster_bins = NULL;
cluster_bincount = NULL;
bin_clusters = NULL;
bin_nclusters = NULL;
neighbor->maxneighs = 100;
neighbor->numneigh = NULL;
neighbor->neighbors = NULL;
@ -91,7 +91,7 @@ void setupNeighbor(Parameter *param, Atom *atom) {
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;
MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
MD_FLOAT atoms_in_cell = MAX(CLUSTER_DIM_M, CLUSTER_DIM_N);
MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N);
MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density);
MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density);
nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex));
@ -104,17 +104,13 @@ void setupNeighbor(Parameter *param, Atom *atom) {
coord = xlo - cutneigh - SMALL * xprd;
mbinxlo = (int)(coord * bininvx);
if (coord < 0.0) {
mbinxlo = mbinxlo - 1;
}
if(coord < 0.0) { mbinxlo = mbinxlo - 1; }
coord = xhi + cutneigh + SMALL * xprd;
mbinxhi = (int)(coord * bininvx);
coord = ylo - cutneigh - SMALL * yprd;
mbinylo = (int)(coord * bininvy);
if (coord < 0.0) {
mbinylo = mbinylo - 1;
}
if(coord < 0.0) { mbinylo = mbinylo - 1; }
coord = yhi + cutneigh + SMALL * yprd;
mbinyhi = (int)(coord * bininvy);
@ -127,15 +123,11 @@ void setupNeighbor(Parameter *param, Atom *atom) {
mbiny = mbinyhi - mbinylo + 1;
nextx = (int)(cutneigh * bininvx);
if(nextx * binsizex < FACTOR * cutneigh) nextx++;
nexty = (int)(cutneigh * bininvy);
if(nextx * binsizex < FACTOR * cutneigh) nextx++;
if(nexty * binsizey < FACTOR * cutneigh) nexty++;
if (stencil) {
free(stencil);
}
if (stencil) { free(stencil); }
stencil = (int *) malloc((2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
nstencil = 0;
@ -147,19 +139,15 @@ void setupNeighbor(Parameter *param, Atom *atom) {
}
}
mbins = mbinx * mbiny;
if(bincount) { free(bincount); }
bincount = (int*) malloc(mbins * sizeof(int));
if(bins) { free(bins); }
if(bin_nclusters) { free(bin_nclusters); }
if(bin_clusters) { free(bin_clusters); }
mbins = mbinx * mbiny;
bincount = (int*) malloc(mbins * sizeof(int));
bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
if (cluster_bincount) { free(cluster_bincount); }
cluster_bincount = (int*) malloc(mbins * sizeof(int));
if (cluster_bins) { free(cluster_bins); }
cluster_bins = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
bin_nclusters = (int*) malloc(mbins * sizeof(int));
bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
/*
DEBUG_MESSAGE("lo, hi = (%e, %e, %e), (%e, %e, %e)\n", xlo, ylo, zlo, xhi, yhi, zhi);
@ -171,20 +159,20 @@ void setupNeighbor(Parameter *param, Atom *atom) {
}
MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
MD_FLOAT dl = atom->clusters[ci].bbminx - atom->clusters[cj].bbmaxx;
MD_FLOAT dh = atom->clusters[cj].bbminx - atom->clusters[ci].bbmaxx;
MD_FLOAT dl = atom->iclusters[ci].bbminx - atom->jclusters[cj].bbmaxx;
MD_FLOAT dh = atom->jclusters[cj].bbminx - atom->iclusters[ci].bbmaxx;
MD_FLOAT dm = MAX(dl, dh);
MD_FLOAT dm0 = MAX(dm, 0.0);
MD_FLOAT d2 = dm0 * dm0;
dl = atom->clusters[ci].bbminy - atom->clusters[cj].bbmaxy;
dh = atom->clusters[cj].bbminy - atom->clusters[ci].bbmaxy;
dl = atom->iclusters[ci].bbminy - atom->jclusters[cj].bbmaxy;
dh = atom->jclusters[cj].bbminy - atom->iclusters[ci].bbmaxy;
dm = MAX(dl, dh);
dm0 = MAX(dm, 0.0);
d2 += dm0 * dm0;
dl = atom->clusters[ci].bbminz - atom->clusters[cj].bbmaxz;
dh = atom->clusters[cj].bbminz - atom->clusters[ci].bbmaxz;
dl = atom->iclusters[ci].bbminz - atom->jclusters[cj].bbmaxz;
dh = atom->jclusters[cj].bbminz - atom->iclusters[ci].bbmaxz;
dm = MAX(dl, dh);
dm0 = MAX(dm, 0.0);
d2 += dm0 * dm0;
@ -192,14 +180,16 @@ MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
}
int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
MD_FLOAT *cjptr = cluster_pos_ptr(cj);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
for(int cjj = 0; cjj < atom->clusters[cj].natoms; cjj++) {
MD_FLOAT delx = cluster_x(ciptr, cii) - cluster_x(cjptr, cjj);
MD_FLOAT dely = cluster_y(ciptr, cii) - cluster_y(cjptr, cjj);
MD_FLOAT delz = cluster_z(ciptr, cii) - cluster_z(cjptr, cjj);
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
MD_FLOAT delx = ci_x[CL_X_OFFSET + cii] - cj_x[CL_X_OFFSET + cjj];
MD_FLOAT dely = ci_x[CL_Y_OFFSET + cii] - cj_x[CL_Y_OFFSET + cjj];
MD_FLOAT delz = ci_x[CL_Z_OFFSET + cii] - cj_x[CL_Z_OFFSET + cjj];
if(delx * delx + dely * dely + delz * delz < rsq) {
return 1;
}
@ -211,11 +201,10 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
void buildNeighbor(Atom *atom, Neighbor *neighbor) {
DEBUG_MESSAGE("buildNeighbor start\n");
int nall = atom->Nclusters_local + atom->Nclusters_ghost;
/* extend atom arrays if necessary */
if(nall > nmax) {
nmax = nall;
if(atom->Nclusters_local > nmax) {
nmax = atom->Nclusters_local;
if(neighbor->numneigh) free(neighbor->numneigh);
if(neighbor->neighbors) free(neighbor->neighbors);
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
@ -236,20 +225,20 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
int n = 0;
int ibin = atom->clusters[ci].bin;
MD_FLOAT ibb_xmin = atom->clusters[ci].bbminx;
MD_FLOAT ibb_xmax = atom->clusters[ci].bbmaxx;
MD_FLOAT ibb_ymin = atom->clusters[ci].bbminy;
MD_FLOAT ibb_ymax = atom->clusters[ci].bbmaxy;
MD_FLOAT ibb_zmin = atom->clusters[ci].bbminz;
MD_FLOAT ibb_zmax = atom->clusters[ci].bbmaxz;
int ibin = atom->icluster_bin[ci];
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy;
MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy;
MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz;
MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz;
for(int k = 0; k < nstencil; k++) {
int jbin = ibin + stencil[k];
int *loc_bin = &cluster_bins[jbin * clusters_per_bin];
int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
int cj, m = -1;
MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
const int c = cluster_bincount[jbin];
const int c = bin_nclusters[jbin];
if(c > 0) {
MD_FLOAT dl, dh, dm, dm0, d_bb_sq;
@ -257,8 +246,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
do {
m++;
cj = loc_bin[m];
jbb_zmin = atom->clusters[cj].bbminz;
jbb_zmax = atom->clusters[cj].bbmaxz;
jbb_zmin = atom->jclusters[cj].bbminz;
jbb_zmax = atom->jclusters[cj].bbmaxz;
dl = ibb_zmin - jbb_zmax;
dh = jbb_zmin - ibb_zmax;
dm = MAX(dl, dh);
@ -266,10 +255,10 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
d_bb_sq = dm0 * dm0;
} while(m + 1 < c && d_bb_sq > cutneighsq);
jbb_xmin = atom->clusters[cj].bbminx;
jbb_xmax = atom->clusters[cj].bbmaxx;
jbb_ymin = atom->clusters[cj].bbminy;
jbb_ymax = atom->clusters[cj].bbmaxy;
jbb_xmin = atom->jclusters[cj].bbminx;
jbb_xmax = atom->jclusters[cj].bbmaxx;
jbb_ymin = atom->jclusters[cj].bbminy;
jbb_ymax = atom->jclusters[cj].bbmaxy;
while(m < c) {
dl = ibb_zmin - jbb_zmax;
@ -303,20 +292,21 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
m++;
if(m < c) {
cj = loc_bin[m];
jbb_xmin = atom->clusters[cj].bbminx;
jbb_xmax = atom->clusters[cj].bbmaxx;
jbb_ymin = atom->clusters[cj].bbminy;
jbb_ymax = atom->clusters[cj].bbmaxy;
jbb_zmin = atom->clusters[cj].bbminz;
jbb_zmax = atom->clusters[cj].bbmaxz;
jbb_xmin = atom->jclusters[cj].bbminx;
jbb_xmax = atom->jclusters[cj].bbmaxx;
jbb_ymin = atom->jclusters[cj].bbminy;
jbb_ymax = atom->jclusters[cj].bbmaxy;
jbb_zmin = atom->jclusters[cj].bbminz;
jbb_zmax = atom->jclusters[cj].bbmaxz;
}
}
}
}
if(CLUSTER_DIM_N > CLUSTER_DIM_M) {
while(n % (CLUSTER_DIM_N / CLUSTER_DIM_M)) {
neighptr[n++] = nall - 1; // Last cluster is always a dummy cluster
// Fill neighbor list with dummy values to fit vector width
if(CLUSTER_N < VECTOR_WIDTH) {
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
}
}
@ -341,38 +331,40 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
/*
DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq);
for(int ci = 0; ci < 6; ci++) {
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
DEBUG_MESSAGE("Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
ci,
atom->clusters[ci].bbminx,
atom->clusters[ci].bbmaxx,
atom->clusters[ci].bbminy,
atom->clusters[ci].bbmaxy,
atom->clusters[ci].bbminz,
atom->clusters[ci].bbmaxz);
atom->iclusters[ci].bbminx,
atom->iclusters[ci].bbmaxx,
atom->iclusters[ci].bbminy,
atom->iclusters[ci].bbmaxy,
atom->iclusters[ci].bbminz,
atom->iclusters[ci].bbmaxz);
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(ciptr, cii), cluster_y(ciptr, cii), cluster_z(ciptr, cii));
for(int cii = 0; cii < CLUSTER_M; cii++) {
DEBUG_MESSAGE("%f, %f, %f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
DEBUG_MESSAGE("Neighbors:\n");
for(int k = 0; k < neighbor->numneigh[ci]; k++) {
const int cj = neighptr[k];
MD_FLOAT *cjptr = cluster_pos_ptr(cj);
int cj = neighptr[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
DEBUG_MESSAGE(" Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
cj,
atom->clusters[cj].bbminx,
atom->clusters[cj].bbmaxx,
atom->clusters[cj].bbminy,
atom->clusters[cj].bbmaxy,
atom->clusters[cj].bbminz,
atom->clusters[cj].bbmaxz);
atom->jclusters[cj].bbminx,
atom->jclusters[cj].bbmaxx,
atom->jclusters[cj].bbminy,
atom->jclusters[cj].bbmaxy,
atom->jclusters[cj].bbminz,
atom->jclusters[cj].bbmaxz);
for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) {
DEBUG_MESSAGE(" %f, %f, %f\n", cluster_x(cjptr, cjj), cluster_y(cjptr, cjj), cluster_z(cjptr, cjj));
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
DEBUG_MESSAGE(" %f, %f, %f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]);
}
}
}
@ -383,7 +375,6 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
DEBUG_MESSAGE("pruneNeighbor start\n");
int nall = atom->Nclusters_local + atom->Nclusters_ghost;
//MD_FLOAT cutsq = param->cutforce * param->cutforce;
MD_FLOAT cutsq = cutneighsq;
@ -393,8 +384,8 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
int k = 0;
// Remove dummy clusters if necessary
if(CLUSTER_DIM_N > CLUSTER_DIM_M) {
while(neighs[numneighs - 1] == nall - 1) {
if(CLUSTER_N < VECTOR_WIDTH) {
while(neighs[numneighs - 1] == atom->dummy_cj) {
numneighs--;
}
}
@ -410,9 +401,9 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
}
// Re-add dummy clusters if necessary
if(CLUSTER_DIM_N > CLUSTER_DIM_M) {
while(numneighs % (CLUSTER_DIM_N / CLUSTER_DIM_M)) {
neighs[numneighs++] = nall - 1; // Last cluster is always a dummy cluster
if(CLUSTER_N < VECTOR_WIDTH) {
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
}
}
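Both the removal and re-adding of dummy entries preserve one invariant: kernels may consume VECTOR_WIDTH / CLUSTER_N j-clusters per SIMD iteration, so the list length must remain a multiple of that step. Restated as a small helper (a sketch using the same names):

/* Sketch of the padding invariant enforced above. */
static int pad_neighbor_list(int *neighs, int numneighs, int dummy_cj) {
    const int step = VECTOR_WIDTH / CLUSTER_N;
    while(numneighs % step) {
        neighs[numneighs++] = dummy_cj; /* dummy cluster; its atoms sit at INFINITY and never pass the cutoff */
    }
    return numneighs;
}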
@ -558,34 +549,37 @@ void buildClusters(Atom *atom) {
for(int bin = 0; bin < mbins; bin++) {
int c = bincount[bin];
int ac = 0;
const int nclusters = ((c + CLUSTER_DIM_M - 1) / CLUSTER_DIM_M);
int nclusters = ((c + CLUSTER_M - 1) / CLUSTER_M);
if(CLUSTER_N > CLUSTER_M && nclusters % 2) { nclusters++; }
for(int cl = 0; cl < nclusters; cl++) {
const int ci = atom->Nclusters_local;
if(ci >= atom->Nclusters_max) {
growClusters(atom);
}
MD_FLOAT *cptr = cluster_pos_ptr(ci);
MD_FLOAT *cvptr = cluster_velocity_ptr(ci);
int ci_sca_base = CI_SCALAR_BASE_INDEX(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
int *ci_type = &atom->cl_type[ci_sca_base];
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
atom->clusters[ci].natoms = 0;
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
atom->iclusters[ci].natoms = 0;
for(int cii = 0; cii < CLUSTER_M; cii++) {
if(ac < c) {
int i = bins[bin * atoms_per_bin + ac];
MD_FLOAT xtmp = atom_x(i);
MD_FLOAT ytmp = atom_y(i);
MD_FLOAT ztmp = atom_z(i);
cluster_x(cptr, cii) = xtmp;
cluster_y(cptr, cii) = ytmp;
cluster_z(cptr, cii) = ztmp;
cluster_x(cvptr, cii) = atom->vx[i];
cluster_y(cvptr, cii) = atom->vy[i];
cluster_z(cvptr, cii) = atom->vz[i];
ci_x[CL_X_OFFSET + cii] = xtmp;
ci_x[CL_Y_OFFSET + cii] = ytmp;
ci_x[CL_Z_OFFSET + cii] = ztmp;
ci_v[CL_X_OFFSET + cii] = atom->vx[i];
ci_v[CL_Y_OFFSET + cii] = atom->vy[i];
ci_v[CL_Z_OFFSET + cii] = atom->vz[i];
// TODO: To create the bounding boxes faster, we can use SIMD operations
if(bbminx > xtmp) { bbminx = xtmp; }
@ -595,24 +589,24 @@ void buildClusters(Atom *atom) {
if(bbminz > ztmp) { bbminz = ztmp; }
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
atom->clusters[ci].type[cii] = atom->type[i];
atom->clusters[ci].natoms++;
ci_type[cii] = atom->type[i];
atom->iclusters[ci].natoms++;
} else {
cluster_x(cptr, cii) = INFINITY;
cluster_y(cptr, cii) = INFINITY;
cluster_z(cptr, cii) = INFINITY;
ci_x[CL_X_OFFSET + cii] = INFINITY;
ci_x[CL_Y_OFFSET + cii] = INFINITY;
ci_x[CL_Z_OFFSET + cii] = INFINITY;
}
ac++;
}
atom->clusters[ci].bin = bin;
atom->clusters[ci].bbminx = bbminx;
atom->clusters[ci].bbmaxx = bbmaxx;
atom->clusters[ci].bbminy = bbminy;
atom->clusters[ci].bbmaxy = bbmaxy;
atom->clusters[ci].bbminz = bbminz;
atom->clusters[ci].bbmaxz = bbmaxz;
atom->icluster_bin[ci] = bin;
atom->iclusters[ci].bbminx = bbminx;
atom->iclusters[ci].bbmaxx = bbmaxx;
atom->iclusters[ci].bbminy = bbminy;
atom->iclusters[ci].bbmaxy = bbmaxy;
atom->iclusters[ci].bbminz = bbminz;
atom->iclusters[ci].bbmaxz = bbmaxz;
atom->Nclusters_local++;
}
}
@ -620,6 +614,94 @@ void buildClusters(Atom *atom) {
DEBUG_MESSAGE("buildClusters end\n");
}
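buildClusters rounds the per-bin cluster count up with the usual ceiling division and, when CLUSTER_N > CLUSTER_M, bumps it to an even number so that defineJClusters below can pair i-clusters two by two. A worked sketch with assumed cluster dimensions:

    // Sketch of the cluster-count rounding in buildClusters. With 10 atoms
    // and CLUSTER_M = 4, ceil(10/4) = 3 i-clusters are needed; since
    // CLUSTER_N = 8 (> CLUSTER_M), the count is bumped to 4 so every
    // j-cluster can be assembled from two complete i-clusters.
    #include <stdio.h>

    #define CLUSTER_M 4
    #define CLUSTER_N 8

    int main() {
        int c = 10; // atoms in this bin (assumed)
        int nclusters = (c + CLUSTER_M - 1) / CLUSTER_M;
        if(CLUSTER_N > CLUSTER_M && nclusters % 2) { nclusters++; }
        printf("%d clusters\n", nclusters); // 4 clusters
        return 0;
    }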
void defineJClusters(Atom *atom) {
DEBUG_MESSAGE("defineJClusters start\n");
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int cj0 = CJ0_FROM_CI(ci);
if(CLUSTER_M == CLUSTER_N) {
atom->jclusters[cj0].bbminx = atom->iclusters[ci].bbminx;
atom->jclusters[cj0].bbmaxx = atom->iclusters[ci].bbmaxx;
atom->jclusters[cj0].bbminy = atom->iclusters[ci].bbminy;
atom->jclusters[cj0].bbmaxy = atom->iclusters[ci].bbmaxy;
atom->jclusters[cj0].bbminz = atom->iclusters[ci].bbminz;
atom->jclusters[cj0].bbmaxz = atom->iclusters[ci].bbmaxz;
atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms;
} else if(CLUSTER_M > CLUSTER_N) {
int cj1 = CJ1_FROM_CI(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
for(int cii = 0; cii < MIN(atom->iclusters[ci].natoms, CLUSTER_N); cii++) {
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
// TODO: To create the bounding boxes faster, we can use SIMD operations
if(bbminx > xtmp) { bbminx = xtmp; }
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
if(bbminy > ytmp) { bbminy = ytmp; }
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
if(bbminz > ztmp) { bbminz = ztmp; }
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
}
atom->jclusters[cj0].bbminx = bbminx;
atom->jclusters[cj0].bbmaxx = bbmaxx;
atom->jclusters[cj0].bbminy = bbminy;
atom->jclusters[cj0].bbmaxy = bbmaxy;
atom->jclusters[cj0].bbminz = bbminz;
atom->jclusters[cj0].bbmaxz = bbmaxz;
atom->jclusters[cj0].natoms = MIN(atom->iclusters[ci].natoms, CLUSTER_N);
bbminx = INFINITY, bbmaxx = -INFINITY;
bbminy = INFINITY, bbmaxy = -INFINITY;
bbminz = INFINITY, bbmaxz = -INFINITY;
for(int cii = CLUSTER_N; cii < atom->iclusters[ci].natoms; cii++) {
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
// TODO: To create the bounding boxes faster, we can use SIMD operations
if(bbminx > xtmp) { bbminx = xtmp; }
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
if(bbminy > ytmp) { bbminy = ytmp; }
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
if(bbminz > ztmp) { bbminz = ztmp; }
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
}
atom->jclusters[cj1].bbminx = bbminx;
atom->jclusters[cj1].bbmaxx = bbmaxx;
atom->jclusters[cj1].bbminy = bbminy;
atom->jclusters[cj1].bbmaxy = bbmaxy;
atom->jclusters[cj1].bbminz = bbminz;
atom->jclusters[cj1].bbmaxz = bbmaxz;
atom->jclusters[cj1].natoms = MAX(0, atom->iclusters[ci].natoms - CLUSTER_N);
} else {
if(ci % 2 == 0) {
const int ci1 = ci + 1;
atom->jclusters[cj0].bbminx = MIN(atom->iclusters[ci].bbminx, atom->iclusters[ci1].bbminx);
atom->jclusters[cj0].bbmaxx = MAX(atom->iclusters[ci].bbmaxx, atom->iclusters[ci1].bbmaxx);
atom->jclusters[cj0].bbminy = MIN(atom->iclusters[ci].bbminy, atom->iclusters[ci1].bbminy);
atom->jclusters[cj0].bbmaxy = MAX(atom->iclusters[ci].bbmaxy, atom->iclusters[ci1].bbmaxy);
atom->jclusters[cj0].bbminz = MIN(atom->iclusters[ci].bbminz, atom->iclusters[ci1].bbminz);
atom->jclusters[cj0].bbmaxz = MAX(atom->iclusters[ci].bbmaxz, atom->iclusters[ci1].bbmaxz);
atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms + atom->iclusters[ci1].natoms;
}
}
}
DEBUG_MESSAGE("defineJClusters end\n");
}
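defineJClusters relies on the CJ0_FROM_CI/CJ1_FROM_CI macros, whose definitions are not part of this diff. A hypothetical definition consistent with the three branches above (one j-cluster per i-cluster when M == N, a split into two when M > N, a fusion of an even/odd i-cluster pair when M < N) could look like this:

    // Hypothetical sketch of the ci -> cj index mapping assumed above; the
    // real CJ0_FROM_CI/CJ1_FROM_CI macros live in the atom header, which is
    // not shown in this diff.
    #include <stdio.h>

    #define CLUSTER_M 8
    #define CLUSTER_N 4   // M > N: each i-cluster splits into two j-clusters

    #if CLUSTER_M == CLUSTER_N
    #   define CJ0_FROM_CI(ci)  (ci)
    #elif CLUSTER_M > CLUSTER_N
    #   define CJ0_FROM_CI(ci)  ((ci) << 1)
    #   define CJ1_FROM_CI(ci)  (((ci) << 1) | 0x1)
    #else  // CLUSTER_N == 2 * CLUSTER_M: two i-clusters fuse into one
    #   define CJ0_FROM_CI(ci)  ((ci) >> 1)
    #endif

    int main() {
        for(int ci = 0; ci < 3; ci++) {
            printf("ci=%d -> cj0=%d, cj1=%d\n", ci, CJ0_FROM_CI(ci), CJ1_FROM_CI(ci));
        }
        return 0;
    }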
void binClusters(Atom *atom) {
DEBUG_MESSAGE("binClusters start\n");
@ -629,7 +711,7 @@ void binClusters(Atom *atom) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
DEBUG_MESSAGE("Cluster %d:\n", ci);
DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n",
atom->clusters[ci].bin,
atom->icluster_bin[ci],
atom->clusters[ci].natoms,
atom->clusters[ci].bbminx,
atom->clusters[ci].bbmaxx,
@ -638,49 +720,65 @@ void binClusters(Atom *atom) {
atom->clusters[ci].bbminz,
atom->clusters[ci].bbmaxz);
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
for(int cii = 0; cii < CLUSTER_M; cii++) {
DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
}
}
*/
const int nlocal = atom->Nclusters_local;
int resize = 1;
const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
const int ncj = atom->Nclusters_local / jfac;
int resize = 1;
while(resize > 0) {
resize = 0;
for(int bin = 0; bin < mbins; bin++) {
cluster_bincount[bin] = 0;
bin_nclusters[bin] = 0;
}
for(int ci = 0; ci < nlocal && !resize; ci++) {
int bin = atom->clusters[ci].bin;
int c = cluster_bincount[bin];
if(c < clusters_per_bin) {
cluster_bins[bin * clusters_per_bin + c] = ci;
cluster_bincount[bin]++;
// Ensure we add this j-cluster only once in the bin
if(!(CLUSTER_M < CLUSTER_N && ci % 2)) {
int bin = atom->icluster_bin[ci];
int c = bin_nclusters[bin];
if(c + 1 < clusters_per_bin) {
bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci);
bin_nclusters[bin]++;
if(CLUSTER_M > CLUSTER_N) {
int cj1 = CJ1_FROM_CI(ci);
if(atom->jclusters[cj1].natoms > 0) {
bin_clusters[bin * clusters_per_bin + c + 1] = cj1;
bin_nclusters[bin]++;
}
}
} else {
resize = 1;
}
}
}
for(int cg = 0; cg < atom->Nclusters_ghost && !resize; cg++) {
const int ci = nlocal + cg;
MD_FLOAT *cptr = cluster_pos_ptr(ci);
MD_FLOAT ci_minz = atom->clusters[ci].bbminz;
MD_FLOAT xtmp, ytmp;
const int cj = ncj + cg;
int ix = -1, iy = -1;
MD_FLOAT xtmp, ytmp;
xtmp = cluster_x(cptr, 0);
ytmp = cluster_y(cptr, 0);
if(atom->jclusters[cj].natoms > 0) {
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT cj_minz = atom->jclusters[cj].bbminz;
xtmp = cj_x[CL_X_OFFSET + 0];
ytmp = cj_x[CL_Y_OFFSET + 0];
coord2bin2D(xtmp, ytmp, &ix, &iy);
ix = MAX(MIN(ix, mbinx - 1), 0);
iy = MAX(MIN(iy, mbiny - 1), 0);
for(int cii = 1; cii < atom->clusters[ci].natoms; cii++) {
for(int cjj = 1; cjj < atom->jclusters[cj].natoms; cjj++) {
int nix, niy;
xtmp = cluster_x(cptr, cii);
ytmp = cluster_y(cptr, cii);
xtmp = cj_x[CL_X_OFFSET + cjj];
ytmp = cj_x[CL_Y_OFFSET + cjj];
coord2bin2D(xtmp, ytmp, &nix, &niy);
nix = MAX(MIN(nix, mbinx - 1), 0);
niy = MAX(MIN(niy, mbiny - 1), 0);
@ -694,19 +792,19 @@ void binClusters(Atom *atom) {
}
int bin = iy * mbinx + ix + 1;
int c = cluster_bincount[bin];
int c = bin_nclusters[bin];
if(c < clusters_per_bin) {
// Insert the current ghost cluster in the bin, keeping clusters
// sorted by z coordinate
int inserted = 0;
for(int i = 0; i < c; i++) {
int last_cl = cluster_bins[bin * clusters_per_bin + i];
if(atom->clusters[last_cl].bbminz > ci_minz) {
cluster_bins[bin * clusters_per_bin + i] = ci;
int last_cl = bin_clusters[bin * clusters_per_bin + i];
if(atom->jclusters[last_cl].bbminz > cj_minz) {
bin_clusters[bin * clusters_per_bin + i] = cj;
for(int j = i + 1; j <= c; j++) {
int tmp = cluster_bins[bin * clusters_per_bin + j];
cluster_bins[bin * clusters_per_bin + j] = last_cl;
int tmp = bin_clusters[bin * clusters_per_bin + j];
bin_clusters[bin * clusters_per_bin + j] = last_cl;
last_cl = tmp;
}
@ -716,26 +814,26 @@ void binClusters(Atom *atom) {
}
if(!inserted) {
cluster_bins[bin * clusters_per_bin + c] = ci;
bin_clusters[bin * clusters_per_bin + c] = cj;
}
atom->clusters[ci].bin = bin;
cluster_bincount[bin]++;
bin_nclusters[bin]++;
} else {
resize = 1;
}
}
}
if(resize) {
free(cluster_bins);
free(bin_clusters);
clusters_per_bin *= 2;
cluster_bins = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
}
}
/*
DEBUG_MESSAGE("cluster_bincount\n");
for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", cluster_bincount[i]); }
DEBUG_MESSAGE("bin_nclusters\n");
for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", bin_nclusters[i]); }
DEBUG_MESSAGE("\n");
*/
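When any bin overflows, binClusters frees the flat bin array, doubles clusters_per_bin, and redoes the whole pass. A standalone sketch of that grow-and-retry pattern (the overflow test here is a stand-in for the real binning loop):

    // Sketch of the grow-and-retry pattern in binClusters: if any bin
    // overflows, double the per-bin capacity and redo the entire pass.
    #include <stdio.h>
    #include <stdlib.h>

    int main() {
        int mbins = 16, clusters_per_bin = 2;
        int nclusters = 100; // demand that forces a few doublings (assumed)
        int *bin_clusters = malloc(mbins * clusters_per_bin * sizeof(int));
        int resize = 1;
        while(resize) {
            resize = (nclusters > mbins * clusters_per_bin); // stand-in overflow test
            if(resize) {
                free(bin_clusters);
                clusters_per_bin *= 2;
                bin_clusters = malloc(mbins * clusters_per_bin * sizeof(int));
            }
        }
        printf("clusters_per_bin = %d\n", clusters_per_bin); // 8
        free(bin_clusters);
        return 0;
    }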
@ -747,16 +845,17 @@ void updateSingleAtoms(Atom *atom) {
int Natom = 0;
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
MD_FLOAT *cvptr = cluster_velocity_ptr(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
atom_x(Natom) = cluster_x(cptr, cii);
atom_y(Natom) = cluster_y(cptr, cii);
atom_z(Natom) = cluster_z(cptr, cii);
atom->vx[Natom] = cluster_x(cvptr, cii);
atom->vy[Natom] = cluster_y(cvptr, cii);
atom->vz[Natom] = cluster_z(cvptr, cii);
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
atom_x(Natom) = ci_x[CL_X_OFFSET + cii];
atom_y(Natom) = ci_x[CL_Y_OFFSET + cii];
atom_z(Natom) = ci_x[CL_Z_OFFSET + cii];
atom->vx[Natom] = ci_v[CL_X_OFFSET + cii];
atom->vy[Natom] = ci_v[CL_Y_OFFSET + cii];
atom->vz[Natom] = ci_v[CL_Z_OFFSET + cii];
Natom++;
}
}
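updateSingleAtoms reads through the new flat cluster arrays: each i-cluster owns a contiguous block of 3 * CLUSTER_M values in cl_x, with all x lanes first, then y, then z, which is what the CL_*_OFFSET accesses above assume. Hypothetical macro definitions consistent with these accesses (the real ones live in the headers, not in this diff):

    // Hypothetical layout macros consistent with the accesses above: cluster
    // ci starts at ci * CLUSTER_M * 3 in cl_x, SoA inside the block.
    #include <stdio.h>

    #define CLUSTER_M                4
    #define CI_VECTOR_BASE_INDEX(ci) ((ci) * CLUSTER_M * 3)
    #define CL_X_OFFSET              (0 * CLUSTER_M)
    #define CL_Y_OFFSET              (1 * CLUSTER_M)
    #define CL_Z_OFFSET              (2 * CLUSTER_M)

    int main() {
        // Locate atom 2 of cluster 5: one contiguous block per cluster.
        int base = CI_VECTOR_BASE_INDEX(5);
        printf("x at %d, y at %d, z at %d\n",
               base + CL_X_OFFSET + 2, base + CL_Y_OFFSET + 2, base + CL_Z_OFFSET + 2);
        return 0;
    }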

View File

@ -28,6 +28,7 @@
#include <atom.h>
#include <allocate.h>
#include <neighbor.h>
#include <util.h>
#define DELTA 20000
@ -45,28 +46,31 @@ void initPbc(Atom* atom) {
/* update coordinates of ghost atoms */
/* uses mapping created in setupPbc */
void updatePbc(Atom *atom, Parameter *param, int firstUpdate) {
int *border_map = atom->border_map;
int nlocal = atom->Nclusters_local;
DEBUG_MESSAGE("updatePbc start\n");
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int ncj = atom->Nclusters_local / jfac;
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
const int ci = nlocal + cg;
MD_FLOAT *cptr = cluster_pos_ptr(ci);
MD_FLOAT *bmap_cptr = cluster_pos_ptr(border_map[cg]);
const int cj = ncj + cg;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base];
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
MD_FLOAT xtmp = cluster_x(bmap_cptr, cii) + atom->PBCx[cg] * xprd;
MD_FLOAT ytmp = cluster_y(bmap_cptr, cii) + atom->PBCy[cg] * yprd;
MD_FLOAT ztmp = cluster_z(bmap_cptr, cii) + atom->PBCz[cg] * zprd;
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;
cluster_x(cptr, cii) = xtmp;
cluster_y(cptr, cii) = ytmp;
cluster_z(cptr, cii) = ztmp;
cj_x[CL_X_OFFSET + cjj] = xtmp;
cj_x[CL_Y_OFFSET + cjj] = ytmp;
cj_x[CL_Z_OFFSET + cjj] = ztmp;
if(firstUpdate) {
// TODO: To create the bounding boxes faster, we can use SIMD operations
@ -80,20 +84,22 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) {
}
if(firstUpdate) {
for(int cii = atom->clusters[ci].natoms; cii < CLUSTER_DIM_M; cii++) {
cluster_x(cptr, cii) = INFINITY;
cluster_y(cptr, cii) = INFINITY;
cluster_z(cptr, cii) = INFINITY;
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
cj_x[CL_X_OFFSET + cjj] = INFINITY;
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
}
atom->clusters[ci].bbminx = bbminx;
atom->clusters[ci].bbmaxx = bbmaxx;
atom->clusters[ci].bbminy = bbminy;
atom->clusters[ci].bbmaxy = bbmaxy;
atom->clusters[ci].bbminz = bbminz;
atom->clusters[ci].bbmaxz = bbmaxz;
atom->jclusters[cj].bbminx = bbminx;
atom->jclusters[cj].bbmaxx = bbmaxx;
atom->jclusters[cj].bbminy = bbminy;
atom->jclusters[cj].bbmaxy = bbmaxy;
atom->jclusters[cj].bbminz = bbminz;
atom->jclusters[cj].bbmaxz = bbmaxz;
}
}
DEBUG_MESSAGE("updatePbc end\n");
}
/* relocate atoms that have left domain according
@ -130,15 +136,18 @@ void updateAtomsPbc(Atom *atom, Parameter *param) {
* that are then enforced in updatePbc */
#define ADDGHOST(dx,dy,dz) \
Nghost++; \
const int g_atom_idx = atom->Nclusters_local + Nghost; \
border_map[Nghost] = ci; \
const int cg = ncj + Nghost; \
const int cj_natoms = atom->jclusters[cj].natoms; \
atom->border_map[Nghost] = cj; \
atom->PBCx[Nghost] = dx; \
atom->PBCy[Nghost] = dy; \
atom->PBCz[Nghost] = dz; \
atom->clusters[g_atom_idx].natoms = atom->clusters[ci].natoms; \
Nghost_atoms += atom->clusters[g_atom_idx].natoms; \
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { \
atom->clusters[g_atom_idx].type[cii] = atom->clusters[ci].type[cii]; \
atom->jclusters[cg].natoms = cj_natoms; \
Nghost_atoms += cj_natoms; \
int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj); \
int cg_sca_base = CJ_SCALAR_BASE_INDEX(cg); \
for(int cjj = 0; cjj < cj_natoms; cjj++) { \
atom->cl_type[cg_sca_base + cjj] = atom->cl_type[cj_sca_base + cjj]; \
}
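ADDGHOST only records the source j-cluster, the periodic shift triple, and the per-atom types; the ghost coordinates themselves are produced later by updatePbc from the recorded PBCx/PBCy/PBCz factors. A small sketch of that shift arithmetic with assumed values:

    // Sketch: how a ghost cluster's coordinates follow from the PBC shift
    // recorded by ADDGHOST (applied later in updatePbc). All values assumed.
    #include <stdio.h>

    int main() {
        double xprd = 10.0;      // box length in x
        double x_source = 0.5;   // atom near the low-x face
        int PBCx = +1;           // shift recorded by ADDGHOST(+1,0,0)
        double x_ghost = x_source + PBCx * xprd;
        printf("ghost x = %.1f\n", x_ghost); // ghost x = 10.5
        return 0;
    }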
/* internal subroutines */
@ -153,30 +162,32 @@ void growPbc(Atom* atom) {
}
void setupPbc(Atom *atom, Parameter *param) {
int *border_map = atom->border_map;
DEBUG_MESSAGE("setupPbc start\n");
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
MD_FLOAT Cutneigh = param->cutneigh;
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int ncj = atom->Nclusters_local / jfac;
int Nghost = -1;
int Nghost_atoms = 0;
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
if (atom->Nclusters_local + Nghost + 7 >= atom->Nclusters_max) {
for(int cj = 0; cj < ncj; cj++) {
if(atom->jclusters[cj].natoms > 0) {
if(atom->Nclusters_local + (Nghost + 7) * jfac >= atom->Nclusters_max) {
growClusters(atom);
}
if (Nghost + 7 >= NmaxGhost) {
if((Nghost + 7) * jfac >= NmaxGhost) {
growPbc(atom);
border_map = atom->border_map;
}
MD_FLOAT bbminx = atom->clusters[ci].bbminx;
MD_FLOAT bbmaxx = atom->clusters[ci].bbmaxx;
MD_FLOAT bbminy = atom->clusters[ci].bbminy;
MD_FLOAT bbmaxy = atom->clusters[ci].bbmaxy;
MD_FLOAT bbminz = atom->clusters[ci].bbminz;
MD_FLOAT bbmaxz = atom->clusters[ci].bbmaxz;
MD_FLOAT bbminx = atom->jclusters[cj].bbminx;
MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx;
MD_FLOAT bbminy = atom->jclusters[cj].bbminy;
MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy;
MD_FLOAT bbminz = atom->jclusters[cj].bbminz;
MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz;
/* Setup ghost atoms */
/* 6 planes */
@ -209,24 +220,28 @@ void setupPbc(Atom *atom, Parameter *param) {
if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); }
if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
}
}
if(atom->Nclusters_local + Nghost + 1 >= atom->Nclusters_max) {
if(ncj + (Nghost + 1) * jfac >= atom->Nclusters_max) {
growClusters(atom);
}
// Add dummy cluster at the end
MD_FLOAT *cptr = cluster_pos_ptr(atom->Nclusters_local + Nghost + 1);
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
cluster_x(cptr, cii) = INFINITY;
cluster_y(cptr, cii) = INFINITY;
cluster_z(cptr, cii) = INFINITY;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
cj_x[CL_X_OFFSET + cjj] = INFINITY;
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
}
// Nghost was initialized to -1, so add one to get the ghost cluster count
atom->dummy_cj = ncj + Nghost + 1;
atom->Nghost = Nghost_atoms;
atom->Nclusters_ghost = Nghost + 1;
atom->Nclusters = atom->Nclusters_local + Nghost + 1;
// Update created ghost clusters positions
updatePbc(atom, param, 1);
DEBUG_MESSAGE("setupPbc end\n");
}
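The dummy cluster appended here is what pruneNeighbor pads neighbor lists with: all of its coordinates are INFINITY, so a padded SIMD lane can never pass the cutoff test. A standalone sketch of why that padding is safe (values assumed):

    // Sketch: why an INFINITY-coordinate dummy cluster is safe padding.
    // A distance against it is INFINITY (or NaN if both ends are dummies),
    // so the cutoff test fails and the padded lane contributes no force.
    #include <math.h>
    #include <stdio.h>

    int main() {
        double cutsq = 2.5 * 2.5;
        double dx = 1.0 - INFINITY;           // finite atom vs. dummy atom
        double rsq = dx * dx;                 // INFINITY
        printf("within cutoff: %d\n", rsq < cutsq); // within cutoff: 0
        return 0;
    }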

View File

@ -18,11 +18,11 @@ void initStats(Stats *s) {
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) {
#ifdef COMPUTE_STATS
const int MxN = CLUSTER_DIM_M * CLUSTER_DIM_N;
const int MxN = CLUSTER_M * CLUSTER_N;
double avg_atoms_cluster = (double)(atom->Nlocal) / (double)(atom->Nclusters_local);
double force_useful_volume = 1e-9 * ( (double)(atom->Nlocal * (param->ntimes + 1)) * (sizeof(MD_FLOAT) * 6 + sizeof(int)) +
(double)(stats->num_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) );
double avg_neigh_atom = (stats->num_neighs * CLUSTER_DIM_N) / (double)(atom->Nlocal * (param->ntimes + 1));
double avg_neigh_atom = (stats->num_neighs * CLUSTER_N) / (double)(atom->Nlocal * (param->ntimes + 1));
double avg_neigh_cluster = (double)(stats->num_neighs) / (double)(stats->calculated_forces);
double avg_simd = stats->force_iters / (double)(atom->Nlocal * (param->ntimes + 1));
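Since each neighbor-list entry now denotes an i-cluster/j-cluster pair, it stands for CLUSTER_N candidate j-atoms, hence the CLUSTER_N factor in avg_neigh_atom. A worked sketch with assumed counts:

    // Worked sketch of avg_neigh_atom: every cluster-pair entry represents
    // CLUSTER_N candidate j-atoms. All inputs below are assumed values.
    #include <stdio.h>

    #define CLUSTER_N 4

    int main() {
        long long num_neighs = 500000000LL; // cluster-pair entries over the run
        int Nlocal = 100000, ntimes = 200;
        double avg_neigh_atom =
            (double)(num_neighs * CLUSTER_N) / (double)((long long)Nlocal * (ntimes + 1));
        printf("avg neighbors per atom: %.2f\n", avg_neigh_atom); // ~99.50
        return 0;
    }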

View File

@ -27,9 +27,10 @@ int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
fprintf(fp, "POINTS %d double\n", atom->Nlocal);
for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
}
fprintf(fp, "\n\n");
@ -70,9 +71,10 @@ int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
fprintf(fp, "POINTS %d double\n", atom->Nghost);
for(int ci = atom->Nclusters_local; ci < atom->Nclusters_local + atom->Nclusters_ghost; ++ci) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
}
fprintf(fp, "\n\n");
@ -116,18 +118,19 @@ int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int
fprintf(fp, "DATASET POLYDATA\n");
fprintf(fp, "POINTS %d double\n", atom->Nlocal);
for(int ci = 0; ci < N; ++ci) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
tot_lines += atom->clusters[ci].natoms;
tot_lines += atom->iclusters[ci].natoms;
}
fprintf(fp, "\n\n");
fprintf(fp, "LINES %d %d\n", N, N + tot_lines);
for(int ci = 0; ci < N; ++ci) {
fprintf(fp, "%d ", atom->clusters[ci].natoms);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%d ", atom->iclusters[ci].natoms);
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%d ", i++);
}
@ -157,18 +160,19 @@ int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int
fprintf(fp, "DATASET POLYDATA\n");
fprintf(fp, "POINTS %d double\n", atom->Nghost);
for(int ci = atom->Nclusters_local; ci < N; ++ci) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
tot_lines += atom->clusters[ci].natoms;
tot_lines += atom->iclusters[ci].natoms;
}
fprintf(fp, "\n\n");
fprintf(fp, "LINES %d %d\n", atom->Nclusters_ghost, atom->Nclusters_ghost + tot_lines);
for(int ci = atom->Nclusters_local; ci < N; ++ci) {
fprintf(fp, "%d ", atom->clusters[ci].natoms);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%d ", atom->iclusters[ci].natoms);
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%d ", i++);
}

View File

@ -52,11 +52,12 @@ void xtc_init(const char *filename, Atom *atom, int timestep) {
void xtc_write(Atom *atom, int timestep, int write_pos, int write_vel) {
int i = 0;
for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
x_buf[i][XX] = cluster_x(cptr, cii);
x_buf[i][YY] = cluster_y(cptr, cii);
x_buf[i][ZZ] = cluster_z(cptr, cii);
x_buf[i][XX] = ci_x[CL_X_OFFSET + cii];
x_buf[i][YY] = ci_x[CL_Y_OFFSET + cii];
x_buf[i][ZZ] = ci_x[CL_Z_OFFSET + cii];
i++;
}
}

include_ISA.mk Normal file
View File

@ -0,0 +1,18 @@
ifeq ($(strip $(ISA)), SSE)
_VECTOR_WIDTH=2
else ifeq ($(strip $(ISA)), AVX)
# Vector width is 4 but AVX2 instruction set is not supported
NO_AVX2=true
_VECTOR_WIDTH=4
else ifeq ($(strip $(ISA)), AVX2)
_VECTOR_WIDTH=4
else ifeq ($(strip $(ISA)), AVX512)
AVX512=true
_VECTOR_WIDTH=8
endif
ifeq ($(strip $(DATA_TYPE)), SP)
VECTOR_WIDTH=$(shell echo $$(( $(_VECTOR_WIDTH) * 2 )))
else
VECTOR_WIDTH=$(_VECTOR_WIDTH)
endif
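include_ISA.mk picks the element count per vector register from the ISA and doubles it for single precision, since SP elements are half as wide. The same arithmetic in C, as a sketch (register widths per ISA are assumed here):

    // Sketch of the VECTOR_WIDTH computation from include_ISA.mk:
    // elements per register = register bits / element bits.
    #include <stdio.h>

    int main() {
        int register_bits = 512;            // AVX512 (assumed example)
        int dp_width = register_bits / 64;  // 8 doubles
        int sp_width = register_bits / 32;  // 16 floats, i.e. dp_width * 2
        printf("DP: %d, SP: %d\n", dp_width, sp_width);
        return 0;
    }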

View File

@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) {
if(!csv) {
printf("Number of timesteps: %d\n", param.ntimes);
printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
printf("Total number of atoms: %d\n", atom->Nlocal);
@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) {
E = getTimeStamp();
double T_accum = E-S;
double freq_hz = param.proc_freq * 1.e9;
const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
if(!csv) {
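With the stub loop-repeat factors removed, the derived metrics are plain functions of run time, atom count, step count, and clock frequency. A worked sketch with assumed inputs:

    // Worked sketch of the reported metrics after the repeat factors were
    // dropped: useful atom updates per second and cycles per atom update.
    #include <stdio.h>

    int main() {
        double T_accum = 2.0;        // measured force-loop time [s] (assumed)
        int Nlocal = 100000, ntimes = 200;
        double freq_hz = 2.4e9;      // assumed processor frequency
        double updates_per_sec = (double)Nlocal / T_accum * (double)ntimes;
        double cycles_per_atom = T_accum / (double)Nlocal / (double)ntimes * freq_hz;
        printf("%e updates/s, %.2f cy/atom\n", updates_per_sec, cycles_per_atom);
        return 0;
    }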