commit 459853dc25

Makefile | 16
@@ -10,15 +10,13 @@ Q ?= @
include $(MAKE_DIR)/config.mk
include $(MAKE_DIR)/include_$(TAG).mk
include $(MAKE_DIR)/include_LIKWID.mk
include $(MAKE_DIR)/include_ISA.mk
include $(MAKE_DIR)/include_GROMACS.mk
INCLUDES += -I./$(SRC_DIR)/includes

ifeq ($(strip $(DATA_LAYOUT)),AOS)
DEFINES += -DAOS
endif
ifeq ($(strip $(CLUSTER_LAYOUT)),AOS)
DEFINES += -DCLUSTER_AOS
endif
ifeq ($(strip $(DATA_TYPE)),SP)
DEFINES += -DPRECISION=1
else
@@ -29,14 +27,6 @@ ifneq ($(ASM_SYNTAX), ATT)
ASFLAGS += -masm=intel
endif

ifneq ($(ATOMS_LOOP_RUNS),)
DEFINES += -DATOMS_LOOP_RUNS=$(ATOMS_LOOP_RUNS)
endif

ifneq ($(NEIGHBORS_LOOP_RUNS),)
DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS)
endif

ifeq ($(strip $(EXPLICIT_TYPES)),true)
DEFINES += -DEXPLICIT_TYPES
endif
@@ -73,6 +63,10 @@ ifeq ($(strip $(NO_AVX2)),true)
DEFINES += -DNO_AVX2
endif

ifeq ($(strip $(AVX512)),true)
DEFINES += -DAVX512
endif

VPATH = $(SRC_DIR) $(ASM_DIR)
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
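(Editorial aside, not part of this commit: the DATA_TYPE switch above only emits -DPRECISION=1 for single precision; how the float type is derived from that define is outside this diff. A hypothetical minimal sketch, assuming an MD_FLOAT typedef exists elsewhere in the tree:)

/* Hypothetical sketch: how PRECISION presumably selects the float type. */
#if PRECISION == 1
typedef float MD_FLOAT;
#else
typedef double MD_FLOAT;
#endif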

config.mk | 14

@@ -1,11 +1,13 @@
# Compiler tag (GCC/CLANG/ICC)
TAG ?= ICC
# Instruction set (SSE/AVX/AVX2/AVX512)
ISA ?= AVX512
# Optimization scheme (lammps/gromacs/clusters_per_bin)
OPT_SCHEME ?= gromacs
# Enable likwid (true or false)
ENABLE_LIKWID ?= true
# SP or DP
DATA_TYPE ?= DP
DATA_TYPE ?= SP
# AOS or SOA
DATA_LAYOUT ?= AOS
# Assembly syntax to generate (ATT/INTEL)
@@ -13,26 +15,16 @@ ASM_SYNTAX ?= ATT
# Debug
DEBUG ?= false

# Number of times to run the atoms loop on stubbed variant
ATOMS_LOOP_RUNS ?= 1
# Number of times to run the neighbors loop on stubbed variant
NEIGHBORS_LOOP_RUNS ?= 1
# Explicitly store and load atom types (true or false)
EXPLICIT_TYPES ?= false
# Trace memory addresses for cache simulator (true or false)
MEM_TRACER ?= false
# Trace indexes and distances for gather-md (true or false)
INDEX_TRACER ?= false
# Vector width (elements) for index and distance tracer
VECTOR_WIDTH ?= 8
# When vector width is 4 but AVX2 is not supported (AVX only), set this to true
NO_AVX2 ?= false
# Compute statistics
COMPUTE_STATS ?= true

# Configurations for gromacs optimization scheme
# AOS or SOA
CLUSTER_LAYOUT ?= SOA
# Use reference version
USE_REFERENCE_VERSION ?= false
# Enable XTC output
@@ -37,6 +37,7 @@ void initAtom(Atom *atom) {
atom->cl_x = NULL;
atom->cl_v = NULL;
atom->cl_f = NULL;
atom->cl_type = NULL;
atom->Natoms = 0;
atom->Nlocal = 0;
atom->Nghost = 0;
@@ -51,7 +52,9 @@ void initAtom(Atom *atom) {
atom->sigma6 = NULL;
atom->cutforcesq = NULL;
atom->cutneighsq = NULL;
atom->clusters = NULL;
atom->iclusters = NULL;
atom->jclusters = NULL;
atom->icluster_bin = NULL;
}

void createAtom(Atom *atom, Parameter *param) {
@@ -426,9 +429,13 @@ void growAtom(Atom *atom) {

void growClusters(Atom *atom) {
int nold = atom->Nclusters_max;
int jterm = MAX(1, CLUSTER_M / CLUSTER_N); // If M>N, we need to allocate more j-clusters
atom->Nclusters_max += DELTA;
atom->clusters = (Cluster*) reallocate(atom->clusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster));
atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT));
atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT));
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT));
atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster));
atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jterm * sizeof(Cluster), nold * jterm * sizeof(Cluster));
atom->icluster_bin = (int*) reallocate(atom->icluster_bin, ALIGNMENT, atom->Nclusters_max * sizeof(int), nold * sizeof(int));
atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
}
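(Editorial aside, not part of this commit: a minimal sketch of why the jterm factor is needed, assuming the M = 2N layout, e.g. CLUSTER_M = 8 and CLUSTER_N = 4; the CJ0/CJ1 mapping referenced here is defined in the cluster header further below.)

/* Sketch: with CLUSTER_M / CLUSTER_N == 2, every i-cluster splits into two
 * j-clusters, so jclusters must grow jterm entries per i-cluster: */
int jterm = MAX(1, 8 / 4);      /* == 2 j-cluster slots per i-cluster */
int cj0   = (5 << 1);           /* CJ0_FROM_CI(5) == 10 */
int cj1   = (5 << 1) | 0x1;     /* CJ1_FROM_CI(5) == 11 */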

@@ -41,11 +41,12 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_FLOAT epsilon = param->epsilon;

for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *fptr = cluster_force_ptr(ci);
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
cluster_x(fptr, cii) = 0.0;
cluster_y(fptr, cii) = 0.0;
cluster_z(fptr, cii) = 0.0;
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_f[CL_X_OFFSET + cii] = 0.0;
ci_f[CL_Y_OFFSET + cii] = 0.0;
ci_f[CL_Z_OFFSET + cii] = 0.0;
}
}

@@ -54,28 +55,39 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat

#pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
MD_FLOAT *cifptr = cluster_force_ptr(ci);
int ci_cj0 = CJ0_FROM_CI(ci);
int ci_cj1 = CJ1_FROM_CI(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];

for(int k = 0; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int any = 0;
MD_FLOAT *cjptr = cluster_pos_ptr(cj);
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
MD_FLOAT xtmp = cluster_x(ciptr, cii);
MD_FLOAT ytmp = cluster_y(ciptr, cii);
MD_FLOAT ztmp = cluster_z(ciptr, cii);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];

for(int cii = 0; cii < CLUSTER_M; cii++) {
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;

for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) {
if(ci != cj || cii != cjj) {
MD_FLOAT delx = xtmp - cluster_x(cjptr, cjj);
MD_FLOAT dely = ytmp - cluster_y(cjptr, cjj);
MD_FLOAT delz = ztmp - cluster_z(cjptr, cjj);
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
#if CLUSTER_M == CLUSTER_N
if(ci_cj0 != cj || cii != cjj) {
#elif CLUSTER_M < CLUSTER_N
if(ci_cj0 != cj || cii + CLUSTER_M * (ci & 0x1) != cjj) {
#else
if((ci_cj0 != cj || cii != cjj) && (ci_cj1 != cj || cii != cjj + CLUSTER_N)) {
#endif
MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj];
MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj];
MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj];
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq < cutforcesq) {
MD_FLOAT sr2 = 1.0 / rsq;
@@ -98,15 +110,15 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
addStat(stats->clusters_outside_cutoff, 1);
}

cluster_x(cifptr, cii) += fix;
cluster_y(cifptr, cii) += fiy;
cluster_z(cifptr, cii) += fiz;
ci_f[CL_X_OFFSET + cii] += fix;
ci_f[CL_Y_OFFSET + cii] += fiy;
ci_f[CL_Z_OFFSET + cii] += fiz;
}
}

addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_DIM_M / CLUSTER_DIM_N));
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
}

LIKWID_MARKER_STOP("force");
@@ -115,6 +127,132 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
return E-S;
}
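(Editorial aside, not part of this commit: the three-way #if in the inner loop above skips exactly the lanes where an i-atom would see itself. A runnable sketch for the assumed CLUSTER_M = 4, CLUSTER_N = 8 case, where two i-clusters share one j-cluster selected by the low bit of ci:)

#include <stdio.h>

int main(void) {
    const int M = 4;                        /* assumed CLUSTER_M */
    for(int ci = 0; ci < 4; ci++) {
        int cj_self = ci >> 1;              /* CJ0_FROM_CI(ci) when M == N/2 */
        int base = M * (ci & 0x1);          /* half of the j-cluster occupied by ci */
        for(int cii = 0; cii < M; cii++) {
            /* the single excluded lane in the self-pair (ci, cj_self): */
            printf("ci=%d cii=%d skips cj=%d cjj=%d\n", ci, cii, cj_self, cii + base);
        }
    }
    return 0;
}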

double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
int Nlocal = atom->Nlocal;
int* neighs;
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;

for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_f[CL_X_OFFSET + cii] = 0.0;
ci_f[CL_Y_OFFSET + cii] = 0.0;
ci_f[CL_Z_OFFSET + cii] = 0.0;
}
}

double S = getTimeStamp();
LIKWID_MARKER_START("force");

#pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];

MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
MD_SIMD_FLOAT yi0_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 0]);
MD_SIMD_FLOAT yi2_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 2]);
MD_SIMD_FLOAT zi0_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 0]);
MD_SIMD_FLOAT zi2_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 2]);
MD_SIMD_FLOAT fix0 = simd_zero();
MD_SIMD_FLOAT fiy0 = simd_zero();
MD_SIMD_FLOAT fiz0 = simd_zero();
MD_SIMD_FLOAT fix2 = simd_zero();
MD_SIMD_FLOAT fiy2 = simd_zero();
MD_SIMD_FLOAT fiz2 = simd_zero();

for(int k = 0; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
unsigned int mask0, mask1, mask2, mask3;

MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);

#if CLUSTER_M == CLUSTER_N
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
mask0 = (unsigned int)(0xf - 0x1 * cond0);
mask1 = (unsigned int)(0xf - 0x2 * cond0);
mask2 = (unsigned int)(0xf - 0x4 * cond0);
mask3 = (unsigned int)(0xf - 0x8 * cond0);
#elif CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
#else
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
mask0 = (unsigned int)(0x3 - 0x1 * cond0);
mask1 = (unsigned int)(0x3 - 0x2 * cond0);
mask2 = (unsigned int)(0x3 - 0x1 * cond1);
mask3 = (unsigned int)(0x3 - 0x2 * cond1);
#endif

MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);

MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));

MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));

MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);

MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));

MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));

fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
}

simd_h_dual_reduce_sum(&ci_f[CL_X_OFFSET], fix0, fix2);
simd_h_dual_reduce_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2);
simd_h_dual_reduce_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2);

addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
}

LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
return E-S;
}
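(Editorial aside, not part of this commit: each mask above holds the per-lane exclusion bits for one i-atom, and two of them are combined into one full-width AVX-512 mask via half_mask_bits. A worked sketch assuming CLUSTER_M == CLUSTER_N == 4 and VECTOR_WIDTH == 8, so half_mask_bits == 4:)

unsigned int cond0 = 1u;                        /* self pair: cj == ci_cj0 */
unsigned int mask0 = 0xf - 0x1 * cond0;         /* 0b1110: i-atom 0 drops j-lane 0 */
unsigned int mask1 = 0xf - 0x2 * cond0;         /* 0b1101: i-atom 1 drops j-lane 1 */
unsigned int packed = (mask1 << 4) | mask0;     /* 0b11011110, one __mmask8 for both atoms */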

double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
@@ -125,33 +263,36 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_FLOAT epsilon = param->epsilon;
MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
const int unroll_j = CLUSTER_DIM_N / CLUSTER_DIM_M;

double S = getTimeStamp();
LIKWID_MARKER_START("force");

#pragma omp parallel for
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
MD_FLOAT *cifptr = cluster_force_ptr(ci);
int ci_cj0 = CJ0_FROM_CI(ci);
#if CLUSTER_M > CLUSTER_N
int ci_cj1 = CJ1_FROM_CI(ci);
#endif
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[ci];

MD_SIMD_FLOAT xi0_tmp = simd_broadcast(cluster_x(ciptr, 0));
MD_SIMD_FLOAT yi0_tmp = simd_broadcast(cluster_y(ciptr, 0));
MD_SIMD_FLOAT zi0_tmp = simd_broadcast(cluster_z(ciptr, 0));
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(cluster_x(ciptr, 1));
MD_SIMD_FLOAT yi1_tmp = simd_broadcast(cluster_y(ciptr, 1));
MD_SIMD_FLOAT zi1_tmp = simd_broadcast(cluster_z(ciptr, 1));
MD_SIMD_FLOAT xi2_tmp = simd_broadcast(cluster_x(ciptr, 2));
MD_SIMD_FLOAT yi2_tmp = simd_broadcast(cluster_y(ciptr, 2));
MD_SIMD_FLOAT zi2_tmp = simd_broadcast(cluster_z(ciptr, 2));
MD_SIMD_FLOAT xi3_tmp = simd_broadcast(cluster_x(ciptr, 3));
MD_SIMD_FLOAT yi3_tmp = simd_broadcast(cluster_y(ciptr, 3));
MD_SIMD_FLOAT zi3_tmp = simd_broadcast(cluster_z(ciptr, 3));
MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]);
MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]);
MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]);
MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]);
MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]);
MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]);
MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]);
MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]);
MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]);
MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]);
MD_SIMD_FLOAT fix0 = simd_zero();
MD_SIMD_FLOAT fiy0 = simd_zero();
MD_SIMD_FLOAT fiz0 = simd_zero();
@@ -165,24 +306,13 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_SIMD_FLOAT fiy3 = simd_zero();
MD_SIMD_FLOAT fiz3 = simd_zero();

for(int k = 0; k < numneighs; k += unroll_j) {
int cj0 = neighs[k + 0];
unsigned int cond0 = (unsigned int)(ci == cj0);
MD_FLOAT *cj0_ptr = cluster_pos_ptr(cj0);

#if VECTOR_WIDTH == 8
int cj1 = neighs[k + 1];
unsigned int cond1 = (unsigned int)(ci == cj1);
MD_FLOAT *cj1_ptr = cluster_pos_ptr(cj1);
MD_SIMD_FLOAT xj_tmp = simd_load2(cj0_ptr, cj1_ptr, 0);
MD_SIMD_FLOAT yj_tmp = simd_load2(cj0_ptr, cj1_ptr, 1);
MD_SIMD_FLOAT zj_tmp = simd_load2(cj0_ptr, cj1_ptr, 2);
#else
MD_SIMD_FLOAT xj_tmp = simd_load(cj0_ptr, 0);
MD_SIMD_FLOAT yj_tmp = simd_load(cj0_ptr, 1);
MD_SIMD_FLOAT zj_tmp = simd_load(cj0_ptr, 2);
#endif

for(int k = 0; k < numneighs; k++) {
int cj = neighs[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
@@ -196,16 +326,26 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);

#if VECTOR_WIDTH == 8
#if CLUSTER_M == CLUSTER_N
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
#elif CLUSTER_M < CLUSTER_N
unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
#else
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
unsigned int cond0 = (unsigned int)(cj == ci_cj0);
unsigned int cond1 = (unsigned int)(cj == ci_cj1);
MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0));
MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1));
MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1));
#endif

MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
@@ -228,10 +368,10 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));

MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec))));
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec))));
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec))));
MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));

fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
@@ -247,22 +387,22 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
}

cluster_x(cifptr, 0) = simd_horizontal_sum(fix0);
cluster_y(cifptr, 0) = simd_horizontal_sum(fiy0);
cluster_z(cifptr, 0) = simd_horizontal_sum(fiz0);
cluster_x(cifptr, 1) = simd_horizontal_sum(fix1);
cluster_y(cifptr, 1) = simd_horizontal_sum(fiy1);
cluster_z(cifptr, 1) = simd_horizontal_sum(fiz1);
cluster_x(cifptr, 2) = simd_horizontal_sum(fix2);
cluster_y(cifptr, 2) = simd_horizontal_sum(fiy2);
cluster_z(cifptr, 2) = simd_horizontal_sum(fiz2);
cluster_x(cifptr, 3) = simd_horizontal_sum(fix3);
cluster_y(cifptr, 3) = simd_horizontal_sum(fiy3);
cluster_z(cifptr, 3) = simd_horizontal_sum(fiz3);
ci_f[CL_X_OFFSET + 0] = simd_h_reduce_sum(fix0);
ci_f[CL_X_OFFSET + 1] = simd_h_reduce_sum(fix1);
ci_f[CL_X_OFFSET + 2] = simd_h_reduce_sum(fix2);
ci_f[CL_X_OFFSET + 3] = simd_h_reduce_sum(fix3);
ci_f[CL_Y_OFFSET + 0] = simd_h_reduce_sum(fiy0);
ci_f[CL_Y_OFFSET + 1] = simd_h_reduce_sum(fiy1);
ci_f[CL_Y_OFFSET + 2] = simd_h_reduce_sum(fiy2);
ci_f[CL_Y_OFFSET + 3] = simd_h_reduce_sum(fiy3);
ci_f[CL_Z_OFFSET + 0] = simd_h_reduce_sum(fiz0);
ci_f[CL_Z_OFFSET + 1] = simd_h_reduce_sum(fiz1);
ci_f[CL_Z_OFFSET + 2] = simd_h_reduce_sum(fiz2);
ci_f[CL_Z_OFFSET + 3] = simd_h_reduce_sum(fiz3);

addStat(stats->calculated_forces, 1);
addStat(stats->num_neighs, numneighs);
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_DIM_M / CLUSTER_DIM_N));
addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
}

LIKWID_MARKER_STOP("force");
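
(Editorial aside, not part of this commit: the old traversal removed above stepped k += unroll_j so two narrow j-clusters could fill one register via simd_load2(). With CLUSTER_N now matching the register width, one aligned load per coordinate suffices; a minimal addressing sketch using names from this diff:)

MD_FLOAT *cj_x = &atom->cl_x[CJ_VECTOR_BASE_INDEX(cj)];
MD_SIMD_FLOAT xj = simd_load(&cj_x[CL_X_OFFSET]);   /* x0..x(N-1) are contiguous */
MD_SIMD_FLOAT yj = simd_load(&cj_x[CL_Y_OFFSET]);   /* y block follows the x block */
MD_SIMD_FLOAT zj = simd_load(&cj_x[CL_Z_OFFSET]);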

@@ -27,13 +27,71 @@

#define DELTA 20000

#define CLUSTER_DIM_M 4
#define CLUSTER_DIM_N VECTOR_WIDTH
#define CLUSTER_M 4

// Nbnxn layouts (as of GROMACS):
// Simd4xN: M=4, N=VECTOR_WIDTH
// Simd2xNN: M=4, N=(VECTOR_WIDTH/2)

// Simd2xNN (here used for single-precision)
#if VECTOR_WIDTH > CLUSTER_M * 2
# define KERNEL_NAME "Simd2xNN"
# define CLUSTER_N (VECTOR_WIDTH / 2)
# define computeForceLJ computeForceLJ_2xnn
// Simd4xN
#else
# define KERNEL_NAME "Simd4xN"
# define CLUSTER_N VECTOR_WIDTH
# define computeForceLJ computeForceLJ_4xn
#endif
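(Editorial aside, not part of this commit, with assumed vector widths: AVX512 double precision gives VECTOR_WIDTH = 8, so 8 > 4*2 fails and Simd4xN runs with CLUSTER_N = 8; AVX512 single precision gives VECTOR_WIDTH = 16, so Simd2xNN runs with CLUSTER_N = 8; AVX2 double precision gives VECTOR_WIDTH = 4 and Simd4xN with CLUSTER_N = 4. A compile-time probe along those lines:)

/* Sketch: Simd4xN implies CLUSTER_N == VECTOR_WIDTH under the assumptions above. */
#if defined(AVX512) && PRECISION == 2 && CLUSTER_N != VECTOR_WIDTH
# error "expected Simd4xN with CLUSTER_N == VECTOR_WIDTH for AVX512 double precision"
#endif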

#ifdef USE_REFERENCE_VERSION
# undef KERNEL_NAME
# undef computeForceLJ
# define KERNEL_NAME "Reference"
# define computeForceLJ computeForceLJ_ref
#endif

#if CLUSTER_M == CLUSTER_N
# define CJ0_FROM_CI(a) (a)
# define CJ1_FROM_CI(a) (a)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#elif CLUSTER_M == CLUSTER_N * 2 // M > N
# define CJ0_FROM_CI(a) ((a) << 1)
# define CJ1_FROM_CI(a) (((a) << 1) | 0x1)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b))
# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
#elif CLUSTER_M == CLUSTER_N / 2 // M < N
# define CJ0_FROM_CI(a) ((a) >> 1)
# define CJ1_FROM_CI(a) ((a) >> 1)
# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#else
# error "Invalid cluster configuration!"
#endif

#if CLUSTER_N != 2 && CLUSTER_N != 4 && CLUSTER_N != 8
# error "Cluster N dimension can be only 2, 4 and 8"
#endif

#define CI_SCALAR_BASE_INDEX(a) (CI_BASE_INDEX(a, 1))
#define CI_VECTOR_BASE_INDEX(a) (CI_BASE_INDEX(a, 3))
#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1))
#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3))

#if CLUSTER_M >= CLUSTER_N
# define CL_X_OFFSET (0 * CLUSTER_M)
# define CL_Y_OFFSET (1 * CLUSTER_M)
# define CL_Z_OFFSET (2 * CLUSTER_M)
#else
# define CL_X_OFFSET (0 * CLUSTER_N)
# define CL_Y_OFFSET (1 * CLUSTER_N)
# define CL_Z_OFFSET (2 * CLUSTER_N)
#endif
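(Editorial worked example, not part of this commit, assuming the M < N branch with CLUSTER_M = 4 and CLUSTER_N = 8, where two i-clusters share one j-cluster slab of 3 * CLUSTER_N values:)

/* For i-cluster ci = 5:
 *   CI_VECTOR_BASE_INDEX(5) = (5 >> 1) * 8 * 3 + (5 & 0x1) * 4 = 48 + 4 = 52
 * so atom cii of that cluster lives at
 *   x: cl_x[52 + CL_X_OFFSET + cii]   (CL_X_OFFSET = 0)
 *   y: cl_x[52 + CL_Y_OFFSET + cii]   (CL_Y_OFFSET = 8)
 *   z: cl_x[52 + CL_Z_OFFSET + cii]   (CL_Z_OFFSET = 16)
 */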

typedef struct {
int bin;
int natoms;
int type[CLUSTER_DIM_M];
MD_FLOAT bbminx, bbmaxx;
MD_FLOAT bbminy, bbmaxy;
MD_FLOAT bbminz, bbmaxz;
@@ -44,9 +102,6 @@ typedef struct {
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
MD_FLOAT *x, *y, *z;
MD_FLOAT *vx, *vy, *vz;
MD_FLOAT *cl_x;
MD_FLOAT *cl_v;
MD_FLOAT *cl_f;
int *border_map;
int *type;
int ntypes;
@@ -54,8 +109,15 @@ typedef struct {
MD_FLOAT *sigma6;
MD_FLOAT *cutforcesq;
MD_FLOAT *cutneighsq;
Cluster *clusters;
int *PBCx, *PBCy, *PBCz;
// Data in cluster format
MD_FLOAT *cl_x;
MD_FLOAT *cl_v;
MD_FLOAT *cl_f;
int *cl_type;
Cluster *iclusters, *jclusters;
int *icluster_bin;
int dummy_cj;
} Atom;

extern void initAtom(Atom*);
@@ -67,10 +129,6 @@ extern int readAtom_dmp(Atom*, Parameter*);
extern void growAtom(Atom*);
extern void growClusters(Atom*);

#define cluster_pos_ptr(ci) &(atom->cl_x[(ci) * CLUSTER_DIM_M * 3])
#define cluster_velocity_ptr(ci) &(atom->cl_v[(ci) * CLUSTER_DIM_M * 3])
#define cluster_force_ptr(ci) &(atom->cl_f[(ci) * CLUSTER_DIM_M * 3])

#ifdef AOS
#define POS_DATA_LAYOUT "AoS"
#define atom_x(i) atom->x[(i) * 3 + 0]
@@ -83,14 +141,4 @@ extern void growClusters(Atom*);
#define atom_z(i) atom->z[i]
#endif

#ifdef CLUSTER_AOS
#define cluster_x(cptr, i) cptr[(i) * 3 + 0]
#define cluster_y(cptr, i) cptr[(i) * 3 + 1]
#define cluster_z(cptr, i) cptr[(i) * 3 + 2]
#else
#define cluster_x(cptr, i) cptr[0 * CLUSTER_DIM_M + (i)]
#define cluster_y(cptr, i) cptr[1 * CLUSTER_DIM_M + (i)]
#define cluster_z(cptr, i) cptr[2 * CLUSTER_DIM_M + (i)]
#endif

#endif

@@ -40,6 +40,7 @@ extern void buildNeighbor(Atom*, Neighbor*);
extern void pruneNeighbor(Parameter*, Atom*, Neighbor*);
extern void sortAtom(Atom*);
extern void buildClusters(Atom*);
extern void defineJClusters(Atom*);
extern void binClusters(Atom*);
extern void updateSingleAtoms(Atom*);
#endif

@@ -21,6 +21,7 @@
* =======================================================================================
*/

#include <stdlib.h>
#include <string.h>
#include <immintrin.h>
#include <zmmintrin.h>
@@ -28,7 +29,9 @@
#define SIMD_PRINT_REAL(a) simd_print_real(#a, a);
#define SIMD_PRINT_MASK(a) simd_print_mask(#a, a);

#if VECTOR_WIDTH == 8 // AVX512
#ifdef AVX512

#if PRECISION == 2 // Double precision

#define MD_SIMD_FLOAT __m512d
#define MD_SIMD_MASK __mmask8
@@ -45,30 +48,100 @@ static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { retur
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); }

static MD_SIMD_FLOAT simd_load2(MD_FLOAT *c0, MD_FLOAT *c1, int d) {
MD_SIMD_FLOAT x;
#ifdef CLUSTER_AOS
__m256i aos_gather_vindex = _mm256_set_epi32(9, 6, 3, 0, 9, 6, 3, 0);
__m256i vindex = _mm256_add_epi32(aos_gather_vindex, _mm256_set1_epi32(d));
x = _mm512_mask_i32gather_pd(simd_zero(), simd_mask_from_u32(0x0f), vindex, c0, sizeof(double));
x = _mm512_mask_i32gather_pd(x, simd_mask_from_u32(0xf0), vindex, c1, sizeof(double));
#else
x = _mm512_load_pd(&c0[d * CLUSTER_DIM_M]);
x = _mm512_insertf64x4(x, _mm256_load_pd(&c1[d * CLUSTER_DIM_M]), 1);
#endif
return x;
}

static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11));
x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01));
return *((MD_FLOAT *) &x);
}

static inline MD_SIMD_FLOAT simd_load_h_duplicate(const double* m) {
return _mm512_broadcast_f64x4(_mm256_load_pd(m));
}

static inline MD_SIMD_FLOAT simd_load_h_dual(const double* m) {
return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1);
}
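(Editorial aside, not part of this commit: scalar sketches of the assumed semantics of the two half-width load helpers above, for VECTOR_WIDTH = 8 doubles:)

/* Reference semantics sketched in scalar C (not the production path): */
static void load_h_duplicate_ref(const double *m, double out[8]) {
    for (int i = 0; i < 8; i++) out[i] = m[i % 4];   /* both halves get m[0..3] */
}
static void load_h_dual_ref(const double *m, double out[8]) {
    for (int i = 0; i < 8; i++) out[i] = m[i / 4];   /* half 0 = m[0], half 1 = m[1] */
}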

static inline double simd_h_dual_reduce_sum(double* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
__m512d t0;
__m256d t2, t3;

t0 = _mm512_add_pd(v0, _mm512_permutex_pd(v0, 0x4e));
t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xccul), v1, _mm512_permutex_pd(v1, 0x4e));
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0xaaul), t0, t0, 0xee);
t2 = _mm512_castpd512_pd256(t0);
t3 = _mm256_load_pd(m);
t3 = _mm256_add_pd(t3, t2);
_mm256_store_pd(m, t3);

t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
}
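(Editorial aside, not part of this commit: a scalar reference for the assumed semantics of the shuffle sequence above. Each input vector carries partial forces for two i-atoms, one per 4-lane half; the helper reduces each half, accumulates the four per-atom sums into m[0..3], and returns the grand total:)

static double h_dual_reduce_sum_ref(double *m, const double v0[8], const double v1[8]) {
    double sums[4] = {0.0, 0.0, 0.0, 0.0};
    for (int i = 0; i < 8; i++) {
        sums[(i / 4) + 0] += v0[i];   /* i-atoms 0 and 1 */
        sums[(i / 4) + 2] += v1[i];   /* i-atoms 2 and 3 */
    }
    double total = 0.0;
    for (int i = 0; i < 4; i++) { m[i] += sums[i]; total += sums[i]; }
    return total;
}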

#else // Single-precision

#define MD_SIMD_FLOAT __m512
#define MD_SIMD_MASK __mmask16

static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_ps(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_ps(a, b); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_ps(a, b, c); }
static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_ps(a); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_ps(a, m, a, b); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask16(a, b); }
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); }
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
// This would only be called in a Mx16 configuration, which is not valid in GROMACS
fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
exit(-1);
return 0.0;
}

static inline MD_SIMD_FLOAT simd_load_h_duplicate(const float* m) {
return _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_load_pd((const double *)(m))));
}

static inline MD_SIMD_FLOAT simd_load_h_dual(const float* m) {
return _mm512_shuffle_f32x4(_mm512_broadcastss_ps(_mm_load_ss(m)), _mm512_broadcastss_ps(_mm_load_ss(m + 1)), 0x44);
}

static inline MD_FLOAT simd_h_dual_reduce_sum(float* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
__m512 t0, t1;
__m128 t2, t3;

t0 = _mm512_shuffle_f32x4(v0, v1, 0x88);
t1 = _mm512_shuffle_f32x4(v0, v1, 0xdd);
t0 = _mm512_add_ps(t0, t1);
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0x4e));
t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0xb1));
t0 = _mm512_maskz_compress_ps(simd_mask_from_u32(0x1111ul), t0);
t3 = _mm512_castps512_ps128(t0);
t2 = _mm_load_ps(m);
t2 = _mm_add_ps(t2, t3);
_mm_store_ps(m, t2);

t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0x4e));
t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1));
return _mm_cvtss_f32(t3);
}

#endif // PRECISION

#else // AVX or AVX2

#if PRECISION == 2 // Double precision

#define MD_SIMD_FLOAT __m256d

#ifdef NO_AVX2
@@ -82,8 +155,10 @@ static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }

#ifdef NO_AVX2

static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return simd_add(simd_mul(a, b), c); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
@@ -97,7 +172,7 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
}
// TODO: Implement this, although it is just required for debugging
static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; }
static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128d a0, a1;
a = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101));
a0 = _mm256_castpd256_pd128(a);
@@ -105,7 +180,9 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
a0 = _mm_add_sd(a0, a1);
return *((MD_FLOAT *) &a0);
}
#else

#else // AVX2

static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_pd(a, m, a, b); }
@@ -113,7 +190,7 @@ static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) {
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128d a0, a1;
// test with shuffle & add as an alternative to hadd later
a = _mm256_hadd_pd(a, a);
@@ -122,24 +199,52 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
a0 = _mm_add_sd(a0, a1);
return *((MD_FLOAT *) &a0);
}

#endif

static MD_SIMD_FLOAT simd_load(MD_FLOAT *c0, int d) {
MD_SIMD_FLOAT x;
#ifdef CLUSTER_AOS
#else // Single-precision

#define MD_SIMD_FLOAT __m256

#ifdef NO_AVX2
#error "Not possible to use AoS cluster layout without AVX2 support!"
#endif
__m128i aos_gather_vindex = _mm128_set_epi32(9, 6, 3, 0);
__m128i vindex = _mm128_add_epi32(aos_gather_vindex, _mm128_set1_epi32(d));
x = _mm256_i32gather_pd(c0, vindex, sizeof(double));
#define MD_SIMD_MASK __m256
#else
x = _mm256_load_pd(&c0[d * CLUSTER_DIM_M]);
#define MD_SIMD_MASK __mmask8
#endif
return x;

static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm256_set1_ps(scalar); }
static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }

#ifdef NO_AVX2

#error "AVX intrinsics with single-precision not implemented!"

#else // AVX2

static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_pd_mask(a, b, _CMP_LT_OQ); }
static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
__m128 t0;
t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
return *((MD_FLOAT *) &t0);
}

#endif
#endif // NO_AVX2

#endif // PRECISION

#endif // AVX or AVX2

static inline void simd_print_real(const char *ref, MD_SIMD_FLOAT a) {
double x[VECTOR_WIDTH];

@@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) {

if(!csv) {
printf("Number of timesteps: %d\n", param.ntimes);
printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
printf("Total number of atoms: %d\n", atom->Nlocal);
@@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) {
E = getTimeStamp();
double T_accum = E-S;
double freq_hz = param.proc_freq * 1.e9;
const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
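(Editorial worked example with made-up numbers, not from this commit: for T_accum = 2.0 s, Nlocal = 100000, ntimes = 200 and proc_freq = 2.4, atoms_updates_per_sec = 100000 / 2.0 * 200 = 1e7, and cycles_per_atom = 2.0 / 100000 / 200 * 2.4e9 = 240.)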

if(!csv) {

@@ -43,14 +43,9 @@

extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);

#ifdef USE_REFERENCE_VERSION
# define computeForceLJ computeForceLJ_ref
#else
# define computeForceLJ computeForceLJ_4xn
#endif

double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
if(param->force_field == FF_EAM) { initEam(eam, param); }
double S, E;
@@ -75,6 +70,7 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
setupThermo(param, atom->Natoms);
if(param->input_file == NULL) { adjustThermo(param, atom); }
buildClusters(atom);
defineJClusters(atom);
setupPbc(atom, param);
binClusters(atom);
buildNeighbor(atom, neighbor);
@@ -89,6 +85,7 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
updateSingleAtoms(atom);
updateAtomsPbc(atom, param);
buildClusters(atom);
defineJClusters(atom);
setupPbc(atom, param);
binClusters(atom);
buildNeighbor(atom, neighbor);
@@ -101,17 +98,18 @@ void initialIntegrate(Parameter *param, Atom *atom) {
DEBUG_MESSAGE("initialIntegrate start\n");

for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
MD_FLOAT *civptr = cluster_velocity_ptr(ci);
MD_FLOAT *cifptr = cluster_force_ptr(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];

for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
cluster_x(civptr, cii) += param->dtforce * cluster_x(cifptr, cii);
cluster_y(civptr, cii) += param->dtforce * cluster_y(cifptr, cii);
cluster_z(civptr, cii) += param->dtforce * cluster_z(cifptr, cii);
cluster_x(ciptr, cii) += param->dt * cluster_x(civptr, cii);
cluster_y(ciptr, cii) += param->dt * cluster_y(civptr, cii);
cluster_z(ciptr, cii) += param->dt * cluster_z(civptr, cii);
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
ci_x[CL_X_OFFSET + cii] += param->dt * ci_v[CL_X_OFFSET + cii];
ci_x[CL_Y_OFFSET + cii] += param->dt * ci_v[CL_Y_OFFSET + cii];
ci_x[CL_Z_OFFSET + cii] += param->dt * ci_v[CL_Z_OFFSET + cii];
}
}

@@ -122,13 +120,14 @@ void finalIntegrate(Parameter *param, Atom *atom) {
DEBUG_MESSAGE("finalIntegrate start\n");

for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *civptr = cluster_velocity_ptr(ci);
MD_FLOAT *cifptr = cluster_force_ptr(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];

for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
cluster_x(civptr, cii) += param->dtforce * cluster_x(cifptr, cii);
cluster_y(civptr, cii) += param->dtforce * cluster_y(cifptr, cii);
cluster_z(civptr, cii) += param->dtforce * cluster_z(cifptr, cii);
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
}
}

@@ -316,6 +315,7 @@ int main(int argc, char** argv) {
}

timer[TOTAL] = getTimeStamp() - timer[TOTAL];
updateSingleAtoms(&atom);
computeThermo(-1, &param, &atom);

if(param.xtc_file != NULL) {
@@ -323,6 +323,7 @@ int main(int argc, char** argv) {
}

printf(HLINE);
printf("Kernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
printf("Data layout for positions: %s\n", POS_DATA_LAYOUT);
#if PRECISION == 1
printf("Using single precision floating point.\n");

@@ -39,8 +39,8 @@ static int nbinx, nbiny;
static int mbinx, mbiny; // n bins in x, y
static int *bincount;
static int *bins;
static int *cluster_bincount;
static int *cluster_bins;
static int *bin_nclusters;
static int *bin_clusters;
static int mbins; //total number of bins
static int atoms_per_bin; // max atoms per bin
static int clusters_per_bin; // max clusters per bin
@@ -63,12 +63,12 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
cutneigh = param->cutneigh;
nmax = 0;
atoms_per_bin = 8;
clusters_per_bin = (atoms_per_bin / CLUSTER_DIM_M) + 4;
clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 10;
stencil = NULL;
bins = NULL;
bincount = NULL;
cluster_bins = NULL;
cluster_bincount = NULL;
bin_clusters = NULL;
bin_nclusters = NULL;
neighbor->maxneighs = 100;
neighbor->numneigh = NULL;
neighbor->neighbors = NULL;
@@ -91,7 +91,7 @@ void setupNeighbor(Parameter *param, Atom *atom) {
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;

MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
MD_FLOAT atoms_in_cell = MAX(CLUSTER_DIM_M, CLUSTER_DIM_N);
MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N);
MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density);
MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density);
nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex));
@@ -104,17 +104,13 @@ void setupNeighbor(Parameter *param, Atom *atom) {

coord = xlo - cutneigh - SMALL * xprd;
mbinxlo = (int)(coord * bininvx);
if (coord < 0.0) {
mbinxlo = mbinxlo - 1;
}
if(coord < 0.0) { mbinxlo = mbinxlo - 1; }
coord = xhi + cutneigh + SMALL * xprd;
mbinxhi = (int)(coord * bininvx);

coord = ylo - cutneigh - SMALL * yprd;
mbinylo = (int)(coord * bininvy);
if (coord < 0.0) {
mbinylo = mbinylo - 1;
}
if(coord < 0.0) { mbinylo = mbinylo - 1; }
coord = yhi + cutneigh + SMALL * yprd;
mbinyhi = (int)(coord * bininvy);

@@ -127,15 +123,11 @@ void setupNeighbor(Parameter *param, Atom *atom) {
mbiny = mbinyhi - mbinylo + 1;

nextx = (int)(cutneigh * bininvx);
if(nextx * binsizex < FACTOR * cutneigh) nextx++;

nexty = (int)(cutneigh * bininvy);
if(nextx * binsizex < FACTOR * cutneigh) nextx++;
if(nexty * binsizey < FACTOR * cutneigh) nexty++;

if (stencil) {
free(stencil);
}

if (stencil) { free(stencil); }
stencil = (int *) malloc((2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
nstencil = 0;

@@ -147,19 +139,15 @@ void setupNeighbor(Parameter *param, Atom *atom) {
}
}

mbins = mbinx * mbiny;

if(bincount) { free(bincount); }
bincount = (int*) malloc(mbins * sizeof(int));

if(bins) { free(bins); }
if(bin_nclusters) { free(bin_nclusters); }
if(bin_clusters) { free(bin_clusters); }
mbins = mbinx * mbiny;
bincount = (int*) malloc(mbins * sizeof(int));
bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));

if (cluster_bincount) { free(cluster_bincount); }
cluster_bincount = (int*) malloc(mbins * sizeof(int));

if (cluster_bins) { free(cluster_bins); }
cluster_bins = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
bin_nclusters = (int*) malloc(mbins * sizeof(int));
bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));

/*
DEBUG_MESSAGE("lo, hi = (%e, %e, %e), (%e, %e, %e)\n", xlo, ylo, zlo, xhi, yhi, zhi);
@@ -171,20 +159,20 @@ void setupNeighbor(Parameter *param, Atom *atom) {
}

MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
MD_FLOAT dl = atom->clusters[ci].bbminx - atom->clusters[cj].bbmaxx;
MD_FLOAT dh = atom->clusters[cj].bbminx - atom->clusters[ci].bbmaxx;
MD_FLOAT dl = atom->iclusters[ci].bbminx - atom->jclusters[cj].bbmaxx;
MD_FLOAT dh = atom->jclusters[cj].bbminx - atom->iclusters[ci].bbmaxx;
MD_FLOAT dm = MAX(dl, dh);
MD_FLOAT dm0 = MAX(dm, 0.0);
MD_FLOAT d2 = dm0 * dm0;

dl = atom->clusters[ci].bbminy - atom->clusters[cj].bbmaxy;
dh = atom->clusters[cj].bbminy - atom->clusters[ci].bbmaxy;
dl = atom->iclusters[ci].bbminy - atom->jclusters[cj].bbmaxy;
dh = atom->jclusters[cj].bbminy - atom->iclusters[ci].bbmaxy;
dm = MAX(dl, dh);
dm0 = MAX(dm, 0.0);
d2 += dm0 * dm0;

dl = atom->clusters[ci].bbminz - atom->clusters[cj].bbmaxz;
dh = atom->clusters[cj].bbminz - atom->clusters[ci].bbmaxz;
dl = atom->iclusters[ci].bbminz - atom->jclusters[cj].bbmaxz;
dh = atom->jclusters[cj].bbminz - atom->iclusters[ci].bbmaxz;
dm = MAX(dl, dh);
dm0 = MAX(dm, 0.0);
d2 += dm0 * dm0;
@@ -192,14 +180,16 @@ MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
}

int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
MD_FLOAT *cjptr = cluster_pos_ptr(cj);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];

for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
for(int cjj = 0; cjj < atom->clusters[cj].natoms; cjj++) {
MD_FLOAT delx = cluster_x(ciptr, cii) - cluster_x(cjptr, cjj);
MD_FLOAT dely = cluster_y(ciptr, cii) - cluster_y(cjptr, cjj);
MD_FLOAT delz = cluster_z(ciptr, cii) - cluster_z(cjptr, cjj);
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
MD_FLOAT delx = ci_x[CL_X_OFFSET + cii] - cj_x[CL_X_OFFSET + cjj];
MD_FLOAT dely = ci_x[CL_Y_OFFSET + cii] - cj_x[CL_Y_OFFSET + cjj];
MD_FLOAT delz = ci_x[CL_Z_OFFSET + cii] - cj_x[CL_Z_OFFSET + cjj];
if(delx * delx + dely * dely + delz * delz < rsq) {
return 1;
}
@@ -211,11 +201,10 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {

void buildNeighbor(Atom *atom, Neighbor *neighbor) {
DEBUG_MESSAGE("buildNeighbor start\n");
int nall = atom->Nclusters_local + atom->Nclusters_ghost;

/* extend atom arrays if necessary */
if(nall > nmax) {
nmax = nall;
if(atom->Nclusters_local > nmax) {
nmax = atom->Nclusters_local;
if(neighbor->numneigh) free(neighbor->numneigh);
if(neighbor->neighbors) free(neighbor->neighbors);
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
@@ -236,20 +225,20 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
int n = 0;
int ibin = atom->clusters[ci].bin;
MD_FLOAT ibb_xmin = atom->clusters[ci].bbminx;
MD_FLOAT ibb_xmax = atom->clusters[ci].bbmaxx;
MD_FLOAT ibb_ymin = atom->clusters[ci].bbminy;
MD_FLOAT ibb_ymax = atom->clusters[ci].bbmaxy;
MD_FLOAT ibb_zmin = atom->clusters[ci].bbminz;
MD_FLOAT ibb_zmax = atom->clusters[ci].bbmaxz;
int ibin = atom->icluster_bin[ci];
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy;
MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy;
MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz;
MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz;

for(int k = 0; k < nstencil; k++) {
int jbin = ibin + stencil[k];
int *loc_bin = &cluster_bins[jbin * clusters_per_bin];
int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
int cj, m = -1;
MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
const int c = cluster_bincount[jbin];
const int c = bin_nclusters[jbin];

if(c > 0) {
MD_FLOAT dl, dh, dm, dm0, d_bb_sq;
@@ -257,8 +246,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
do {
m++;
cj = loc_bin[m];
jbb_zmin = atom->clusters[cj].bbminz;
jbb_zmax = atom->clusters[cj].bbmaxz;
jbb_zmin = atom->jclusters[cj].bbminz;
jbb_zmax = atom->jclusters[cj].bbmaxz;
dl = ibb_zmin - jbb_zmax;
dh = jbb_zmin - ibb_zmax;
dm = MAX(dl, dh);
@@ -266,10 +255,10 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
d_bb_sq = dm0 * dm0;
} while(m + 1 < c && d_bb_sq > cutneighsq);

jbb_xmin = atom->clusters[cj].bbminx;
jbb_xmax = atom->clusters[cj].bbmaxx;
jbb_ymin = atom->clusters[cj].bbminy;
jbb_ymax = atom->clusters[cj].bbmaxy;
jbb_xmin = atom->jclusters[cj].bbminx;
jbb_xmax = atom->jclusters[cj].bbmaxx;
jbb_ymin = atom->jclusters[cj].bbminy;
jbb_ymax = atom->jclusters[cj].bbmaxy;

while(m < c) {
dl = ibb_zmin - jbb_zmax;
@@ -303,20 +292,21 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
m++;
if(m < c) {
cj = loc_bin[m];
jbb_xmin = atom->clusters[cj].bbminx;
jbb_xmax = atom->clusters[cj].bbmaxx;
jbb_ymin = atom->clusters[cj].bbminy;
jbb_ymax = atom->clusters[cj].bbmaxy;
jbb_zmin = atom->clusters[cj].bbminz;
jbb_zmax = atom->clusters[cj].bbmaxz;
jbb_xmin = atom->jclusters[cj].bbminx;
jbb_xmax = atom->jclusters[cj].bbmaxx;
jbb_ymin = atom->jclusters[cj].bbminy;
jbb_ymax = atom->jclusters[cj].bbmaxy;
jbb_zmin = atom->jclusters[cj].bbminz;
jbb_zmax = atom->jclusters[cj].bbmaxz;
}
}
}
}

if(CLUSTER_DIM_N > CLUSTER_DIM_M) {
while(n % (CLUSTER_DIM_N / CLUSTER_DIM_M)) {
neighptr[n++] = nall - 1; // Last cluster is always a dummy cluster
// Fill neighbor list with dummy values to fit vector width
if(CLUSTER_N < VECTOR_WIDTH) {
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
}
}
|
||||
|
||||
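The padding step above keeps each neighbor count a multiple of VECTOR_WIDTH / CLUSTER_N: when a j-cluster is narrower than the SIMD width, several j-clusters are packed into one vector, and the tail is filled with the dummy cluster (whose coordinates are INFINITY) so padded lanes never pass the cutoff test. A standalone sketch of the same rule, with generic parameter names:

/* Pad the neighbor list of one i-cluster; dummy is the sentinel cluster id. */
static int padNeighbors(int *neigh, int n, int vector_width, int cluster_n, int dummy) {
    if(cluster_n < vector_width) {
        while(n % (vector_width / cluster_n)) {
            neigh[n++] = dummy;  /* never within cutoff: coordinates are INFINITY */
        }
    }
    return n;
}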
@ -341,38 +331,40 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
/*
DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq);
for(int ci = 0; ci < 6; ci++) {
MD_FLOAT *ciptr = cluster_pos_ptr(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);

DEBUG_MESSAGE("Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
ci,
atom->clusters[ci].bbminx,
atom->clusters[ci].bbmaxx,
atom->clusters[ci].bbminy,
atom->clusters[ci].bbmaxy,
atom->clusters[ci].bbminz,
atom->clusters[ci].bbmaxz);
atom->iclusters[ci].bbminx,
atom->iclusters[ci].bbmaxx,
atom->iclusters[ci].bbminy,
atom->iclusters[ci].bbmaxy,
atom->iclusters[ci].bbminz,
atom->iclusters[ci].bbmaxz);

for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(ciptr, cii), cluster_y(ciptr, cii), cluster_z(ciptr, cii));
for(int cii = 0; cii < CLUSTER_M; cii++) {
DEBUG_MESSAGE("%f, %f, %f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}

DEBUG_MESSAGE("Neighbors:\n");
for(int k = 0; k < neighbor->numneigh[ci]; k++) {
const int cj = neighptr[k];
MD_FLOAT *cjptr = cluster_pos_ptr(cj);
int cj = neighptr[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];

DEBUG_MESSAGE("    Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
cj,
atom->clusters[cj].bbminx,
atom->clusters[cj].bbmaxx,
atom->clusters[cj].bbminy,
atom->clusters[cj].bbmaxy,
atom->clusters[cj].bbminz,
atom->clusters[cj].bbmaxz);
atom->jclusters[cj].bbminx,
atom->jclusters[cj].bbmaxx,
atom->jclusters[cj].bbminy,
atom->jclusters[cj].bbmaxy,
atom->jclusters[cj].bbminz,
atom->jclusters[cj].bbmaxz);

for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) {
DEBUG_MESSAGE("    %f, %f, %f\n", cluster_x(cjptr, cjj), cluster_y(cjptr, cjj), cluster_z(cjptr, cjj));
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
DEBUG_MESSAGE("    %f, %f, %f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]);
}
}
}
@ -383,7 +375,6 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {

void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
DEBUG_MESSAGE("pruneNeighbor start\n");
int nall = atom->Nclusters_local + atom->Nclusters_ghost;
//MD_FLOAT cutsq = param->cutforce * param->cutforce;
MD_FLOAT cutsq = cutneighsq;

@ -393,8 +384,8 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
int k = 0;

// Remove dummy clusters if necessary
if(CLUSTER_DIM_N > CLUSTER_DIM_M) {
while(neighs[numneighs - 1] == nall - 1) {
if(CLUSTER_N < VECTOR_WIDTH) {
while(neighs[numneighs - 1] == atom->dummy_cj) {
numneighs--;
}
}
@ -410,9 +401,9 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
}

// Readd dummy clusters if necessary
if(CLUSTER_DIM_N > CLUSTER_DIM_M) {
while(numneighs % (CLUSTER_DIM_N / CLUSTER_DIM_M)) {
neighs[numneighs++] = nall - 1; // Last cluster is always a dummy cluster
if(CLUSTER_N < VECTOR_WIDTH) {
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
}
}

@ -558,34 +549,37 @@ void buildClusters(Atom *atom) {
for(int bin = 0; bin < mbins; bin++) {
int c = bincount[bin];
int ac = 0;
const int nclusters = ((c + CLUSTER_DIM_M - 1) / CLUSTER_DIM_M);
int nclusters = ((c + CLUSTER_M - 1) / CLUSTER_M);
if(CLUSTER_N > CLUSTER_M && nclusters % 2) { nclusters++; }
for(int cl = 0; cl < nclusters; cl++) {
const int ci = atom->Nclusters_local;

if(ci >= atom->Nclusters_max) {
growClusters(atom);
}

MD_FLOAT *cptr = cluster_pos_ptr(ci);
MD_FLOAT *cvptr = cluster_velocity_ptr(ci);
int ci_sca_base = CI_SCALAR_BASE_INDEX(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
int *ci_type = &atom->cl_type[ci_sca_base];
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
atom->clusters[ci].natoms = 0;

for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
atom->iclusters[ci].natoms = 0;
for(int cii = 0; cii < CLUSTER_M; cii++) {
if(ac < c) {
int i = bins[bin * atoms_per_bin + ac];
MD_FLOAT xtmp = atom_x(i);
MD_FLOAT ytmp = atom_y(i);
MD_FLOAT ztmp = atom_z(i);

cluster_x(cptr, cii) = xtmp;
cluster_y(cptr, cii) = ytmp;
cluster_z(cptr, cii) = ztmp;
cluster_x(cvptr, cii) = atom->vx[i];
cluster_y(cvptr, cii) = atom->vy[i];
cluster_z(cvptr, cii) = atom->vz[i];
ci_x[CL_X_OFFSET + cii] = xtmp;
ci_x[CL_Y_OFFSET + cii] = ytmp;
ci_x[CL_Z_OFFSET + cii] = ztmp;
ci_v[CL_X_OFFSET + cii] = atom->vx[i];
ci_v[CL_Y_OFFSET + cii] = atom->vy[i];
ci_v[CL_Z_OFFSET + cii] = atom->vz[i];

// TODO: To create the bounding boxes faster, we can use SIMD operations
if(bbminx > xtmp) { bbminx = xtmp; }
@ -595,24 +589,24 @@ void buildClusters(Atom *atom) {
if(bbminz > ztmp) { bbminz = ztmp; }
if(bbmaxz < ztmp) { bbmaxz = ztmp; }

atom->clusters[ci].type[cii] = atom->type[i];
atom->clusters[ci].natoms++;
ci_type[cii] = atom->type[i];
atom->iclusters[ci].natoms++;
} else {
cluster_x(cptr, cii) = INFINITY;
cluster_y(cptr, cii) = INFINITY;
cluster_z(cptr, cii) = INFINITY;
ci_x[CL_X_OFFSET + cii] = INFINITY;
ci_x[CL_Y_OFFSET + cii] = INFINITY;
ci_x[CL_Z_OFFSET + cii] = INFINITY;
}

ac++;
}

atom->clusters[ci].bin = bin;
atom->clusters[ci].bbminx = bbminx;
atom->clusters[ci].bbmaxx = bbmaxx;
atom->clusters[ci].bbminy = bbminy;
atom->clusters[ci].bbmaxy = bbmaxy;
atom->clusters[ci].bbminz = bbminz;
atom->clusters[ci].bbmaxz = bbmaxz;
atom->icluster_bin[ci] = bin;
atom->iclusters[ci].bbminx = bbminx;
atom->iclusters[ci].bbmaxx = bbmaxx;
atom->iclusters[ci].bbminy = bbminy;
atom->iclusters[ci].bbmaxy = bbmaxy;
atom->iclusters[ci].bbminz = bbminz;
atom->iclusters[ci].bbmaxz = bbmaxz;
atom->Nclusters_local++;
}
}
@ -620,6 +614,94 @@ void buildClusters(Atom *atom) {
DEBUG_MESSAGE("buildClusters end\n");
}
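The cluster count per bin above rounds the bin population up to whole i-clusters and, when j-clusters are wider than i-clusters, to an even number so two i-clusters can always be paired into one j-cluster. A sketch of the same arithmetic (with assumed M = 4 and N = 8, a bin of 9 atoms gives 3 i-clusters, rounded up to 4):

static int clustersForBin(int c, int cluster_m, int cluster_n) {
    int nclusters = (c + cluster_m - 1) / cluster_m;   /* ceil(c / M) */
    if(cluster_n > cluster_m && nclusters % 2) {
        nclusters++;                                   /* keep i-cluster pairs complete */
    }
    return nclusters;
}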

void defineJClusters(Atom *atom) {
DEBUG_MESSAGE("defineJClusters start\n");

for(int ci = 0; ci < atom->Nclusters_local; ci++) {
int cj0 = CJ0_FROM_CI(ci);

if(CLUSTER_M == CLUSTER_N) {
atom->jclusters[cj0].bbminx = atom->iclusters[ci].bbminx;
atom->jclusters[cj0].bbmaxx = atom->iclusters[ci].bbmaxx;
atom->jclusters[cj0].bbminy = atom->iclusters[ci].bbminy;
atom->jclusters[cj0].bbmaxy = atom->iclusters[ci].bbmaxy;
atom->jclusters[cj0].bbminz = atom->iclusters[ci].bbminz;
atom->jclusters[cj0].bbmaxz = atom->iclusters[ci].bbmaxz;
atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms;

} else if(CLUSTER_M > CLUSTER_N) {
int cj1 = CJ1_FROM_CI(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;

for(int cii = 0; cii < MIN(atom->iclusters[ci].natoms, CLUSTER_N); cii++) {
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];

// TODO: To create the bounding boxes faster, we can use SIMD operations
if(bbminx > xtmp) { bbminx = xtmp; }
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
if(bbminy > ytmp) { bbminy = ytmp; }
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
if(bbminz > ztmp) { bbminz = ztmp; }
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
}

atom->jclusters[cj0].bbminx = bbminx;
atom->jclusters[cj0].bbmaxx = bbmaxx;
atom->jclusters[cj0].bbminy = bbminy;
atom->jclusters[cj0].bbmaxy = bbmaxy;
atom->jclusters[cj0].bbminz = bbminz;
atom->jclusters[cj0].bbmaxz = bbmaxz;
atom->jclusters[cj0].natoms = MIN(atom->iclusters[ci].natoms, CLUSTER_N);

bbminx = INFINITY, bbmaxx = -INFINITY;
bbminy = INFINITY, bbmaxy = -INFINITY;
bbminz = INFINITY, bbmaxz = -INFINITY;

for(int cii = CLUSTER_N; cii < atom->iclusters[ci].natoms; cii++) {
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];

// TODO: To create the bounding boxes faster, we can use SIMD operations
if(bbminx > xtmp) { bbminx = xtmp; }
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
if(bbminy > ytmp) { bbminy = ytmp; }
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
if(bbminz > ztmp) { bbminz = ztmp; }
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
}

atom->jclusters[cj1].bbminx = bbminx;
atom->jclusters[cj1].bbmaxx = bbmaxx;
atom->jclusters[cj1].bbminy = bbminy;
atom->jclusters[cj1].bbmaxy = bbmaxy;
atom->jclusters[cj1].bbminz = bbminz;
atom->jclusters[cj1].bbmaxz = bbmaxz;
atom->jclusters[cj1].natoms = MAX(0, atom->iclusters[ci].natoms - CLUSTER_N);

} else {
if(ci % 2 == 0) {
const int ci1 = ci + 1;
atom->jclusters[cj0].bbminx = MIN(atom->iclusters[ci].bbminx, atom->iclusters[ci1].bbminx);
atom->jclusters[cj0].bbmaxx = MAX(atom->iclusters[ci].bbmaxx, atom->iclusters[ci1].bbmaxx);
atom->jclusters[cj0].bbminy = MIN(atom->iclusters[ci].bbminy, atom->iclusters[ci1].bbminy);
atom->jclusters[cj0].bbmaxy = MAX(atom->iclusters[ci].bbmaxy, atom->iclusters[ci1].bbmaxy);
atom->jclusters[cj0].bbminz = MIN(atom->iclusters[ci].bbminz, atom->iclusters[ci1].bbminz);
atom->jclusters[cj0].bbmaxz = MAX(atom->iclusters[ci].bbmaxz, atom->iclusters[ci1].bbmaxz);
atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms + atom->iclusters[ci1].natoms;
}
}
}

DEBUG_MESSAGE("defineJClusters end\n");
}
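defineJClusters relies on a fixed index mapping between i- and j-clusters (CJ0_FROM_CI/CJ1_FROM_CI). The three branches suggest the mapping sketched below; these definitions are an illustration of the cases, not the macros from the patch:

/* m, n = atoms per i-cluster and per j-cluster (assumed, with m = 2n or n = 2m). */
static int cj0FromCi(int ci, int m, int n) {
    if(m == n) return ci;        /* identity: same geometry */
    if(m > n)  return ci * 2;    /* i-cluster splits into two j-clusters */
    return ci / 2;               /* ci and ci+1 merge into one j-cluster */
}

static int cj1FromCi(int ci) {
    return ci * 2 + 1;           /* second half, only used when m > n */
}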

void binClusters(Atom *atom) {
DEBUG_MESSAGE("binClusters start\n");

@ -629,7 +711,7 @@ void binClusters(Atom *atom) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
DEBUG_MESSAGE("Cluster %d:\n", ci);
DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n",
atom->clusters[ci].bin,
atom->icluster_bin[ci],
atom->clusters[ci].natoms,
atom->clusters[ci].bbminx,
atom->clusters[ci].bbmaxx,
@ -638,49 +720,65 @@ void binClusters(Atom *atom) {
atom->clusters[ci].bbminz,
atom->clusters[ci].bbmaxz);

for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
for(int cii = 0; cii < CLUSTER_M; cii++) {
DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
}
}
*/

const int nlocal = atom->Nclusters_local;
int resize = 1;
const int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
const int ncj = atom->Nclusters_local / jfac;

int resize = 1;
while(resize > 0) {
resize = 0;

for(int bin = 0; bin < mbins; bin++) {
cluster_bincount[bin] = 0;
bin_nclusters[bin] = 0;
}

for(int ci = 0; ci < nlocal && !resize; ci++) {
int bin = atom->clusters[ci].bin;
int c = cluster_bincount[bin];
if(c < clusters_per_bin) {
cluster_bins[bin * clusters_per_bin + c] = ci;
cluster_bincount[bin]++;
// Assure we add this j-cluster only once in the bin
if(!(CLUSTER_M < CLUSTER_N && ci % 2)) {
int bin = atom->icluster_bin[ci];
int c = bin_nclusters[bin];
if(c + 1 < clusters_per_bin) {
bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci);
bin_nclusters[bin]++;

if(CLUSTER_M > CLUSTER_N) {
int cj1 = CJ1_FROM_CI(ci);
if(atom->jclusters[cj1].natoms > 0) {
bin_clusters[bin * clusters_per_bin + c + 1] = cj1;
bin_nclusters[bin]++;
}
}
} else {
resize = 1;
}
}
}

for(int cg = 0; cg < atom->Nclusters_ghost && !resize; cg++) {
const int ci = nlocal + cg;
MD_FLOAT *cptr = cluster_pos_ptr(ci);
MD_FLOAT ci_minz = atom->clusters[ci].bbminz;
MD_FLOAT xtmp, ytmp;
const int cj = ncj + cg;
int ix = -1, iy = -1;
MD_FLOAT xtmp, ytmp;

xtmp = cluster_x(cptr, 0);
ytmp = cluster_y(cptr, 0);
if(atom->jclusters[cj].natoms > 0) {
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT cj_minz = atom->jclusters[cj].bbminz;

xtmp = cj_x[CL_X_OFFSET + 0];
ytmp = cj_x[CL_Y_OFFSET + 0];
coord2bin2D(xtmp, ytmp, &ix, &iy);
ix = MAX(MIN(ix, mbinx - 1), 0);
iy = MAX(MIN(iy, mbiny - 1), 0);
for(int cii = 1; cii < atom->clusters[ci].natoms; cii++) {
for(int cjj = 1; cjj < atom->jclusters[cj].natoms; cjj++) {
int nix, niy;
xtmp = cluster_x(cptr, cii);
ytmp = cluster_y(cptr, cii);
xtmp = cj_x[CL_X_OFFSET + cjj];
ytmp = cj_x[CL_Y_OFFSET + cjj];
coord2bin2D(xtmp, ytmp, &nix, &niy);
nix = MAX(MIN(nix, mbinx - 1), 0);
niy = MAX(MIN(niy, mbiny - 1), 0);
@ -694,19 +792,19 @@ void binClusters(Atom *atom) {
}

int bin = iy * mbinx + ix + 1;
int c = cluster_bincount[bin];
int c = bin_nclusters[bin];
if(c < clusters_per_bin) {
// Insert the current ghost cluster in the bin keeping clusters
// sorted by z coordinate
int inserted = 0;
for(int i = 0; i < c; i++) {
int last_cl = cluster_bins[bin * clusters_per_bin + i];
if(atom->clusters[last_cl].bbminz > ci_minz) {
cluster_bins[bin * clusters_per_bin + i] = ci;
int last_cl = bin_clusters[bin * clusters_per_bin + i];
if(atom->jclusters[last_cl].bbminz > cj_minz) {
bin_clusters[bin * clusters_per_bin + i] = cj;

for(int j = i + 1; j <= c; j++) {
int tmp = cluster_bins[bin * clusters_per_bin + j];
cluster_bins[bin * clusters_per_bin + j] = last_cl;
int tmp = bin_clusters[bin * clusters_per_bin + j];
bin_clusters[bin * clusters_per_bin + j] = last_cl;
last_cl = tmp;
}

@ -716,26 +814,26 @@ void binClusters(Atom *atom) {
}

if(!inserted) {
cluster_bins[bin * clusters_per_bin + c] = ci;
bin_clusters[bin * clusters_per_bin + c] = cj;
}

atom->clusters[ci].bin = bin;
cluster_bincount[bin]++;
bin_nclusters[bin]++;
} else {
resize = 1;
}
}
}

if(resize) {
free(cluster_bins);
free(bin_clusters);
clusters_per_bin *= 2;
cluster_bins = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
}
}

/*
DEBUG_MESSAGE("cluster_bincount\n");
for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", cluster_bincount[i]); }
DEBUG_MESSAGE("bin_nclusters\n");
for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", bin_nclusters[i]); }
DEBUG_MESSAGE("\n");
*/

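Ghost j-clusters are inserted into their bin sorted by bbminz, which is what lets the neighbor search pre-skip the head of a bin with its do/while scan and stop early once clusters are too far away in z. The shifting loop above is a plain sorted insert; generic sketch, where minz[] stands in for atom->jclusters[...].bbminz:

/* Insert cluster id cj into list[0..count-1], keeping minz[list[i]] ascending.
 * The caller must guarantee the list has room for one more entry. */
static void insertSortedByMinZ(int *list, int count, int cj, const double *minz) {
    int i = 0;
    while(i < count && minz[list[i]] <= minz[cj]) { i++; }    /* find slot */
    for(int j = count; j > i; j--) { list[j] = list[j - 1]; } /* shift tail up */
    list[i] = cj;
}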
@ -747,16 +845,17 @@ void updateSingleAtoms(Atom *atom) {
int Natom = 0;

for(int ci = 0; ci < atom->Nclusters_local; ci++) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
MD_FLOAT *cvptr = cluster_velocity_ptr(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];

for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
atom_x(Natom) = cluster_x(cptr, cii);
atom_y(Natom) = cluster_y(cptr, cii);
atom_z(Natom) = cluster_z(cptr, cii);
atom->vx[Natom] = cluster_x(cvptr, cii);
atom->vy[Natom] = cluster_y(cvptr, cii);
atom->vz[Natom] = cluster_z(cvptr, cii);
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
atom_x(Natom) = ci_x[CL_X_OFFSET + cii];
atom_y(Natom) = ci_x[CL_Y_OFFSET + cii];
atom_z(Natom) = ci_x[CL_Z_OFFSET + cii];
atom->vx[Natom] = ci_v[CL_X_OFFSET + cii];
atom->vy[Natom] = ci_v[CL_Y_OFFSET + cii];
atom->vz[Natom] = ci_v[CL_Z_OFFSET + cii];
Natom++;
}
}

105
gromacs/pbc.c
@ -28,6 +28,7 @@
#include <atom.h>
#include <allocate.h>
#include <neighbor.h>
#include <util.h>

#define DELTA 20000

@ -45,28 +46,31 @@ void initPbc(Atom* atom) {
/* update coordinates of ghost atoms */
/* uses mapping created in setupPbc */
void updatePbc(Atom *atom, Parameter *param, int firstUpdate) {
int *border_map = atom->border_map;
int nlocal = atom->Nclusters_local;
DEBUG_MESSAGE("updatePbc start\n");
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int ncj = atom->Nclusters_local / jfac;
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;

for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
const int ci = nlocal + cg;
MD_FLOAT *cptr = cluster_pos_ptr(ci);
MD_FLOAT *bmap_cptr = cluster_pos_ptr(border_map[cg]);
const int cj = ncj + cg;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base];
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;

for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
MD_FLOAT xtmp = cluster_x(bmap_cptr, cii) + atom->PBCx[cg] * xprd;
MD_FLOAT ytmp = cluster_y(bmap_cptr, cii) + atom->PBCy[cg] * yprd;
MD_FLOAT ztmp = cluster_z(bmap_cptr, cii) + atom->PBCz[cg] * zprd;
for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;

cluster_x(cptr, cii) = xtmp;
cluster_y(cptr, cii) = ytmp;
cluster_z(cptr, cii) = ztmp;
cj_x[CL_X_OFFSET + cjj] = xtmp;
cj_x[CL_Y_OFFSET + cjj] = ytmp;
cj_x[CL_Z_OFFSET + cjj] = ztmp;

if(firstUpdate) {
// TODO: To create the bounding boxes faster, we can use SIMD operations
@ -80,20 +84,22 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) {
}

if(firstUpdate) {
for(int cii = atom->clusters[ci].natoms; cii < CLUSTER_DIM_M; cii++) {
cluster_x(cptr, cii) = INFINITY;
cluster_y(cptr, cii) = INFINITY;
cluster_z(cptr, cii) = INFINITY;
for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
cj_x[CL_X_OFFSET + cjj] = INFINITY;
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
}

atom->clusters[ci].bbminx = bbminx;
atom->clusters[ci].bbmaxx = bbmaxx;
atom->clusters[ci].bbminy = bbminy;
atom->clusters[ci].bbmaxy = bbmaxy;
atom->clusters[ci].bbminz = bbminz;
atom->clusters[ci].bbmaxz = bbmaxz;
atom->jclusters[cj].bbminx = bbminx;
atom->jclusters[cj].bbmaxx = bbmaxx;
atom->jclusters[cj].bbminy = bbminy;
atom->jclusters[cj].bbmaxy = bbmaxy;
atom->jclusters[cj].bbminz = bbminz;
atom->jclusters[cj].bbmaxz = bbmaxz;
}
}

DEBUG_MESSAGE("updatePbc end\n");
}
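The ghost update above is the usual periodic-image construction: a ghost cluster copies its source cluster's positions (via border_map) shifted by an integer number of box lengths per axis, with PBCx/y/z taking values in {-1, 0, +1}. Scalar sketch with assumed packed x,y,z triplets instead of the cluster layout:

/* Shift n positions by one periodic image; dst/src hold x,y,z triplets. */
static void shiftImage(double *dst, const double *src, int n,
                       int pbcx, int pbcy, int pbcz,
                       double xprd, double yprd, double zprd) {
    for(int i = 0; i < n; i++) {
        dst[3 * i + 0] = src[3 * i + 0] + pbcx * xprd;
        dst[3 * i + 1] = src[3 * i + 1] + pbcy * yprd;
        dst[3 * i + 2] = src[3 * i + 2] + pbcz * zprd;
    }
}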

/* relocate atoms that have left domain according
@ -130,15 +136,18 @@ void updateAtomsPbc(Atom *atom, Parameter *param) {
* that are then enforced in updatePbc */
#define ADDGHOST(dx,dy,dz); \
Nghost++; \
const int g_atom_idx = atom->Nclusters_local + Nghost; \
border_map[Nghost] = ci; \
const int cg = ncj + Nghost; \
const int cj_natoms = atom->jclusters[cj].natoms; \
atom->border_map[Nghost] = cj; \
atom->PBCx[Nghost] = dx; \
atom->PBCy[Nghost] = dy; \
atom->PBCz[Nghost] = dz; \
atom->clusters[g_atom_idx].natoms = atom->clusters[ci].natoms; \
Nghost_atoms += atom->clusters[g_atom_idx].natoms; \
for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { \
atom->clusters[g_atom_idx].type[cii] = atom->clusters[ci].type[cii]; \
atom->jclusters[cg].natoms = cj_natoms; \
Nghost_atoms += cj_natoms; \
int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj); \
int cg_sca_base = CJ_SCALAR_BASE_INDEX(cg); \
for(int cjj = 0; cjj < cj_natoms; cjj++) { \
atom->cl_type[cg_sca_base + cjj] = atom->cl_type[cj_sca_base + cjj]; \
}

/* internal subroutines */
@ -153,30 +162,32 @@ void growPbc(Atom* atom) {
}

void setupPbc(Atom *atom, Parameter *param) {
int *border_map = atom->border_map;
DEBUG_MESSAGE("setupPbc start\n");
MD_FLOAT xprd = param->xprd;
MD_FLOAT yprd = param->yprd;
MD_FLOAT zprd = param->zprd;
MD_FLOAT Cutneigh = param->cutneigh;
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
int ncj = atom->Nclusters_local / jfac;
int Nghost = -1;
int Nghost_atoms = 0;

for(int ci = 0; ci < atom->Nclusters_local; ci++) {
if (atom->Nclusters_local + Nghost + 7 >= atom->Nclusters_max) {
for(int cj = 0; cj < ncj; cj++) {
if(atom->jclusters[cj].natoms > 0) {
if(atom->Nclusters_local + (Nghost + 7) * jfac >= atom->Nclusters_max) {
growClusters(atom);
}

if (Nghost + 7 >= NmaxGhost) {
if((Nghost + 7) * jfac >= NmaxGhost) {
growPbc(atom);
border_map = atom->border_map;
}

MD_FLOAT bbminx = atom->clusters[ci].bbminx;
MD_FLOAT bbmaxx = atom->clusters[ci].bbmaxx;
MD_FLOAT bbminy = atom->clusters[ci].bbminy;
MD_FLOAT bbmaxy = atom->clusters[ci].bbmaxy;
MD_FLOAT bbminz = atom->clusters[ci].bbminz;
MD_FLOAT bbmaxz = atom->clusters[ci].bbmaxz;
MD_FLOAT bbminx = atom->jclusters[cj].bbminx;
MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx;
MD_FLOAT bbminy = atom->jclusters[cj].bbminy;
MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy;
MD_FLOAT bbminz = atom->jclusters[cj].bbminz;
MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz;

/* Setup ghost atoms */
/* 6 planes */
@ -209,24 +220,28 @@ void setupPbc(Atom *atom, Parameter *param) {
if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); }
if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
}
}

if(atom->Nclusters_local + Nghost + 1 >= atom->Nclusters_max) {
if(ncj + (Nghost + 1) * jfac >= atom->Nclusters_max) {
growClusters(atom);
}

// Add dummy cluster at the end
MD_FLOAT *cptr = cluster_pos_ptr(atom->Nclusters_local + Nghost + 1);
for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
cluster_x(cptr, cii) = INFINITY;
cluster_y(cptr, cii) = INFINITY;
cluster_z(cptr, cii) = INFINITY;
int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
cj_x[CL_X_OFFSET + cjj] = INFINITY;
cj_x[CL_Y_OFFSET + cjj] = INFINITY;
cj_x[CL_Z_OFFSET + cjj] = INFINITY;
}

// increase by one to make it the ghost atom count
atom->dummy_cj = ncj + Nghost + 1;
atom->Nghost = Nghost_atoms;
atom->Nclusters_ghost = Nghost + 1;
atom->Nclusters = atom->Nclusters_local + Nghost + 1;

// Update created ghost clusters positions
updatePbc(atom, param, 1);
DEBUG_MESSAGE("setupPbc end\n");
}

@ -18,11 +18,11 @@ void initStats(Stats *s) {
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) {
#ifdef COMPUTE_STATS

const int MxN = CLUSTER_DIM_M * CLUSTER_DIM_N;
const int MxN = CLUSTER_M * CLUSTER_N;
double avg_atoms_cluster = (double)(atom->Nlocal) / (double)(atom->Nclusters_local);
double force_useful_volume = 1e-9 * ( (double)(atom->Nlocal * (param->ntimes + 1)) * (sizeof(MD_FLOAT) * 6 + sizeof(int)) +
(double)(stats->num_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) );
double avg_neigh_atom = (stats->num_neighs * CLUSTER_DIM_N) / (double)(atom->Nlocal * (param->ntimes + 1));
double avg_neigh_atom = (stats->num_neighs * CLUSTER_N) / (double)(atom->Nlocal * (param->ntimes + 1));
double avg_neigh_cluster = (double)(stats->num_neighs) / (double)(stats->calculated_forces);
double avg_simd = stats->force_iters / (double)(atom->Nlocal * (param->ntimes + 1));

@ -27,9 +27,10 @@ int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
fprintf(fp, "POINTS %d double\n", atom->Nlocal);
for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
}
fprintf(fp, "\n\n");
@ -70,9 +71,10 @@ int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
fprintf(fp, "POINTS %d double\n", atom->Nghost);
for(int ci = atom->Nclusters_local; ci < atom->Nclusters_local + atom->Nclusters_ghost; ++ci) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
}
fprintf(fp, "\n\n");
@ -116,18 +118,19 @@ int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int
fprintf(fp, "DATASET POLYDATA\n");
fprintf(fp, "POINTS %d double\n", atom->Nlocal);
for(int ci = 0; ci < N; ++ci) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}

tot_lines += atom->clusters[ci].natoms;
tot_lines += atom->iclusters[ci].natoms;
}
fprintf(fp, "\n\n");
fprintf(fp, "LINES %d %d\n", N, N + tot_lines);
for(int ci = 0; ci < N; ++ci) {
fprintf(fp, "%d ", atom->clusters[ci].natoms);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%d ", atom->iclusters[ci].natoms);
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%d ", i++);
}

@ -157,18 +160,19 @@ int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int
fprintf(fp, "DATASET POLYDATA\n");
fprintf(fp, "POINTS %d double\n", atom->Nghost);
for(int ci = atom->Nclusters_local; ci < N; ++ci) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii));
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}

tot_lines += atom->clusters[ci].natoms;
tot_lines += atom->iclusters[ci].natoms;
}
fprintf(fp, "\n\n");
fprintf(fp, "LINES %d %d\n", atom->Nclusters_ghost, atom->Nclusters_ghost + tot_lines);
for(int ci = atom->Nclusters_local; ci < N; ++ci) {
fprintf(fp, "%d ", atom->clusters[ci].natoms);
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
fprintf(fp, "%d ", atom->iclusters[ci].natoms);
for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) {
fprintf(fp, "%d ", i++);
}

@ -52,11 +52,12 @@ void xtc_init(const char *filename, Atom *atom, int timestep) {
void xtc_write(Atom *atom, int timestep, int write_pos, int write_vel) {
int i = 0;
for(int ci = 0; ci < atom->Nclusters_local; ++ci) {
MD_FLOAT *cptr = cluster_pos_ptr(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) {
x_buf[i][XX] = cluster_x(cptr, cii);
x_buf[i][YY] = cluster_y(cptr, cii);
x_buf[i][ZZ] = cluster_z(cptr, cii);
x_buf[i][XX] = ci_x[CL_X_OFFSET + cii];
x_buf[i][YY] = ci_x[CL_Y_OFFSET + cii];
x_buf[i][ZZ] = ci_x[CL_Z_OFFSET + cii];
i++;
}
}

18
include_ISA.mk
Normal file
@ -0,0 +1,18 @@
ifeq ($(strip $(ISA)), SSE)
_VECTOR_WIDTH=2
else ifeq ($(strip $(ISA)), AVX)
# Vector width is 4 but AVX2 instruction set is not supported
NO_AVX2=true
_VECTOR_WIDTH=4
else ifeq ($(strip $(ISA)), AVX2)
_VECTOR_WIDTH=4
else ifeq ($(strip $(ISA)), AVX512)
AVX512=true
_VECTOR_WIDTH=8
endif

ifeq ($(strip $(DATA_TYPE)), SP)
VECTOR_WIDTH=$(shell echo $$(( $(_VECTOR_WIDTH) * 2 )))
else
VECTOR_WIDTH=$(_VECTOR_WIDTH)
endif
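include_ISA.mk derives the SIMD width in elements: 2/4/4/8 double-precision lanes for SSE/AVX/AVX2/AVX512, doubled in single precision because twice as many 32-bit lanes fit per register. The same table written out as a C sketch:

#include <string.h>

/* Width in MD_FLOAT elements per SIMD register for a given ISA string. */
static int vectorWidth(const char *isa, int single_precision) {
    int w = 0;
    if(strcmp(isa, "SSE") == 0)                                   w = 2;
    else if(strcmp(isa, "AVX") == 0 || strcmp(isa, "AVX2") == 0)  w = 4;
    else if(strcmp(isa, "AVX512") == 0)                           w = 8;
    return single_precision ? 2 * w : w;  /* SP packs twice as many lanes */
}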
@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) {

if(!csv) {
printf("Number of timesteps: %d\n", param.ntimes);
printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
printf("Total number of atoms: %d\n", atom->Nlocal);
@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) {
E = getTimeStamp();
double T_accum = E-S;
double freq_hz = param.proc_freq * 1.e9;
const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
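With the stubbed repeat factors removed, the metrics reduce to: updates per second = Nlocal * ntimes / T_accum, and cycles per atom-update = core frequency / updates-per-second. A worked sketch with made-up numbers (100k atoms, 200 steps, 2 s, 2.4 GHz gives 1e7 updates/s and 240 cycles/atom):

#include <stdio.h>

int main(void) {
    int Nlocal = 100000, ntimes = 200;   /* assumed system size and step count */
    double T_accum = 2.0;                /* assumed measured runtime [s] */
    double freq_hz = 2.4e9;              /* assumed fixed core clock */
    double updates_per_sec = (double)Nlocal * (double)ntimes / T_accum;
    double cycles_per_atom = freq_hz / updates_per_sec;
    printf("%.3e updates/s, %.2f cycles/atom\n", updates_per_sec, cycles_per_atom);
    return 0;
}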

if(!csv) {