diff --git a/Makefile b/Makefile
index e214557..9d3cc90 100644
--- a/Makefile
+++ b/Makefile
@@ -10,15 +10,13 @@ Q ?= @
 include $(MAKE_DIR)/config.mk
 include $(MAKE_DIR)/include_$(TAG).mk
 include $(MAKE_DIR)/include_LIKWID.mk
+include $(MAKE_DIR)/include_ISA.mk
 include $(MAKE_DIR)/include_GROMACS.mk
 INCLUDES += -I./$(SRC_DIR)/includes

 ifeq ($(strip $(DATA_LAYOUT)),AOS)
 DEFINES += -DAOS
 endif
-ifeq ($(strip $(CLUSTER_LAYOUT)),AOS)
-DEFINES += -DCLUSTER_AOS
-endif
 ifeq ($(strip $(DATA_TYPE)),SP)
 DEFINES += -DPRECISION=1
 else
@@ -29,14 +27,6 @@ ifneq ($(ASM_SYNTAX), ATT)
 ASFLAGS += -masm=intel
 endif

-ifneq ($(ATOMS_LOOP_RUNS),)
-    DEFINES += -DATOMS_LOOP_RUNS=$(ATOMS_LOOP_RUNS)
-endif
-
-ifneq ($(NEIGHBORS_LOOP_RUNS),)
-    DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS)
-endif
-
 ifeq ($(strip $(EXPLICIT_TYPES)),true)
 DEFINES += -DEXPLICIT_TYPES
 endif
@@ -73,6 +63,10 @@ ifeq ($(strip $(NO_AVX2)),true)
 DEFINES += -DNO_AVX2
 endif

+ifeq ($(strip $(AVX512)),true)
+    DEFINES += -DAVX512
+endif
+
 VPATH     = $(SRC_DIR) $(ASM_DIR)
 ASM       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
diff --git a/config.mk b/config.mk
index 3e8bb88..9afe7a0 100644
--- a/config.mk
+++ b/config.mk
@@ -1,11 +1,13 @@
 # Compiler tag (GCC/CLANG/ICC)
 TAG ?= ICC
+# Instruction set (SSE/AVX/AVX2/AVX512)
+ISA ?= AVX512
 # Optimization scheme (lammps/gromacs/clusters_per_bin)
 OPT_SCHEME ?= gromacs
 # Enable likwid (true or false)
 ENABLE_LIKWID ?= true
 # SP or DP
-DATA_TYPE ?= DP
+DATA_TYPE ?= SP
 # AOS or SOA
 DATA_LAYOUT ?= AOS
 # Assembly syntax to generate (ATT/INTEL)
@@ -13,26 +15,16 @@ ASM_SYNTAX ?= ATT
 # Debug
 DEBUG ?= false
-# Number of times to run the atoms loop on stubbed variant
-ATOMS_LOOP_RUNS ?= 1
-# Number of times to run the neighbors loop on stubbed variant
-NEIGHBORS_LOOP_RUNS ?= 1
 # Explicitly store and load atom types (true or false)
 EXPLICIT_TYPES ?= false
 # Trace memory addresses for cache simulator (true or false)
 MEM_TRACER ?= false
 # Trace indexes and distances for gather-md (true or false)
 INDEX_TRACER ?= false
-# Vector width (elements) for index and distance tracer
-VECTOR_WIDTH ?= 8
-# When vector width is 4 but AVX2 is not supported (AVX only), set this to true
-NO_AVX2 ?= false
 # Compute statistics
 COMPUTE_STATS ?= true

 # Configurations for gromacs optimization scheme
-# AOS or SOA
-CLUSTER_LAYOUT ?= SOA
 # Use reference version
 USE_REFERENCE_VERSION ?= false
 # Enable XTC output
diff --git a/gromacs/atom.c b/gromacs/atom.c
index 8c11da9..88ee0f8 100644
--- a/gromacs/atom.c
+++ b/gromacs/atom.c
@@ -37,6 +37,7 @@ void initAtom(Atom *atom) {
     atom->cl_x = NULL;
     atom->cl_v = NULL;
     atom->cl_f = NULL;
+    atom->cl_type = NULL;
     atom->Natoms = 0;
     atom->Nlocal = 0;
     atom->Nghost = 0;
@@ -51,7 +52,9 @@ void initAtom(Atom *atom) {
     atom->sigma6 = NULL;
     atom->cutforcesq = NULL;
     atom->cutneighsq = NULL;
-    atom->clusters = NULL;
+    atom->iclusters = NULL;
+    atom->jclusters = NULL;
+    atom->icluster_bin = NULL;
 }

 void createAtom(Atom *atom, Parameter *param) {
@@ -426,9 +429,13 @@ void growAtom(Atom *atom) {

 void growClusters(Atom *atom) {
     int nold = atom->Nclusters_max;
+    int jterm = MAX(1, CLUSTER_M / CLUSTER_N); // If M>N, we need to allocate more j-clusters
     atom->Nclusters_max += DELTA;
-    atom->clusters = (Cluster*) reallocate(atom->clusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster));
-    atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT));
-    atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT));
-    atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_DIM_M * 3 * sizeof(MD_FLOAT));
+    atom->iclusters = (Cluster*) reallocate(atom->iclusters, ALIGNMENT, atom->Nclusters_max * sizeof(Cluster), nold * sizeof(Cluster));
+    atom->jclusters = (Cluster*) reallocate(atom->jclusters, ALIGNMENT, atom->Nclusters_max * jterm * sizeof(Cluster), nold * jterm * sizeof(Cluster));
+    atom->icluster_bin = (int*) reallocate(atom->icluster_bin, ALIGNMENT, atom->Nclusters_max * sizeof(int), nold * sizeof(int));
+    atom->cl_x = (MD_FLOAT*) reallocate(atom->cl_x, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+    atom->cl_f = (MD_FLOAT*) reallocate(atom->cl_f, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+    atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+    atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
 }
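[Reviewer note, not part of the patch: growClusters() sizes the j-cluster array with jterm = MAX(1, CLUSTER_M / CLUSTER_N), using integer division. When M > N each i-cluster splits into two j-clusters; when M <= N, two i-clusters share one j-cluster, so one slot per i-cluster already over-allocates. A minimal standalone sketch that just prints the factor for the layouts this series supports:

    #include <stdio.h>
    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    int main(void) {
        int configs[3][2] = {{4, 4}, {4, 8}, {4, 2}}; // {CLUSTER_M, CLUSTER_N}
        for(int i = 0; i < 3; i++) {
            int m = configs[i][0], n = configs[i][1];
            // Integer division: 4/8 == 0, so MAX(1, ...) clamps to one j-cluster per i-cluster
            printf("M=%d N=%d -> jterm=%d\n", m, n, MAX(1, m / n));
        }
        return 0;
    }

This prints jterm = 1, 1 and 2 for the three cases.]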
diff --git a/gromacs/force_lj.c b/gromacs/force_lj.c
index 07ed0e5..afdc311 100644
--- a/gromacs/force_lj.c
+++ b/gromacs/force_lj.c
@@ -41,11 +41,12 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
     MD_FLOAT epsilon = param->epsilon;

     for(int ci = 0; ci < atom->Nclusters_local; ci++) {
-        MD_FLOAT *fptr = cluster_force_ptr(ci);
-        for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
-            cluster_x(fptr, cii) = 0.0;
-            cluster_y(fptr, cii) = 0.0;
-            cluster_z(fptr, cii) = 0.0;
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
+        for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
+            ci_f[CL_X_OFFSET + cii] = 0.0;
+            ci_f[CL_Y_OFFSET + cii] = 0.0;
+            ci_f[CL_Z_OFFSET + cii] = 0.0;
         }
     }

@@ -54,28 +55,39 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
     #pragma omp parallel for
     for(int ci = 0; ci < atom->Nclusters_local; ci++) {
-        MD_FLOAT *ciptr = cluster_pos_ptr(ci);
-        MD_FLOAT *cifptr = cluster_force_ptr(ci);
+        int ci_cj0 = CJ0_FROM_CI(ci);
+        int ci_cj1 = CJ1_FROM_CI(ci);
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
         neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
         int numneighs = neighbor->numneigh[ci];

         for(int k = 0; k < numneighs; k++) {
             int cj = neighs[k];
+            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
             int any = 0;
-            MD_FLOAT *cjptr = cluster_pos_ptr(cj);
-            for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
-                MD_FLOAT xtmp = cluster_x(ciptr, cii);
-                MD_FLOAT ytmp = cluster_y(ciptr, cii);
-                MD_FLOAT ztmp = cluster_z(ciptr, cii);
+            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
+
+            for(int cii = 0; cii < CLUSTER_M; cii++) {
+                MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii];
+                MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii];
+                MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii];
                 MD_FLOAT fix = 0;
                 MD_FLOAT fiy = 0;
                 MD_FLOAT fiz = 0;

-                for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) {
-                    if(ci != cj || cii != cjj) {
-                        MD_FLOAT delx = xtmp - cluster_x(cjptr, cjj);
-                        MD_FLOAT dely = ytmp - cluster_y(cjptr, cjj);
-                        MD_FLOAT delz = ztmp - cluster_z(cjptr, cjj);
+                for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
+                    #if CLUSTER_M == CLUSTER_N
+                    if(ci_cj0 != cj || cii != cjj) {
+                    #elif CLUSTER_M < CLUSTER_N
+                    if(ci_cj0 != cj || cii + CLUSTER_M * (ci & 0x1) != cjj) {
+                    #else
+                    if((ci_cj0 != cj || cii != cjj) && (ci_cj1 != cj || cii != cjj + CLUSTER_N)) {
+                    #endif
+                        MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj];
+                        MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj];
+                        MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj];
                         MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
                         if(rsq < cutforcesq) {
                             MD_FLOAT sr2 = 1.0 / rsq;
@@ -98,15 +110,15 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
                     addStat(stats->clusters_outside_cutoff, 1);
                 }

-                cluster_x(cifptr, cii) += fix;
-                cluster_y(cifptr, cii) += fiy;
-                cluster_z(cifptr, cii) += fiz;
+                ci_f[CL_X_OFFSET + cii] += fix;
+                ci_f[CL_Y_OFFSET + cii] += fiy;
+                ci_f[CL_Z_OFFSET + cii] += fiz;
             }
         }

         addStat(stats->calculated_forces, 1);
         addStat(stats->num_neighs, numneighs);
-        addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_DIM_M / CLUSTER_DIM_N));
+        addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
     }

     LIKWID_MARKER_STOP("force");
@@ -115,6 +127,136 @@ double computeForceLJ_ref(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
     return E-S;
 }

+double computeForceLJ_2xnn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
+    DEBUG_MESSAGE("computeForceLJ_2xnn begin\n");
+    int Nlocal = atom->Nlocal;
+    int* neighs;
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6 = param->sigma6;
+    MD_FLOAT epsilon = param->epsilon;
+    MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
+    MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
+    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
+    MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
+    MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
+    const unsigned int half_mask_bits = VECTOR_WIDTH >> 1;
+
+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
+        for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
+            ci_f[CL_X_OFFSET + cii] = 0.0;
+            ci_f[CL_Y_OFFSET + cii] = 0.0;
+            ci_f[CL_Z_OFFSET + cii] = 0.0;
+        }
+    }
+
+    double S = getTimeStamp();
+    LIKWID_MARKER_START("force");
+
+    #pragma omp parallel for
+    for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+        int ci_cj0 = CJ0_FROM_CI(ci);
+        #if CLUSTER_M > CLUSTER_N
+        int ci_cj1 = CJ1_FROM_CI(ci);
+        #endif
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
+        neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
+        int numneighs = neighbor->numneigh[ci];
+
+        MD_SIMD_FLOAT xi0_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 0]);
+        MD_SIMD_FLOAT xi2_tmp = simd_load_h_dual(&ci_x[CL_X_OFFSET + 2]);
+        MD_SIMD_FLOAT yi0_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 0]);
+        MD_SIMD_FLOAT yi2_tmp = simd_load_h_dual(&ci_x[CL_Y_OFFSET + 2]);
+        MD_SIMD_FLOAT zi0_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 0]);
+        MD_SIMD_FLOAT zi2_tmp = simd_load_h_dual(&ci_x[CL_Z_OFFSET + 2]);
+        MD_SIMD_FLOAT fix0 = simd_zero();
+        MD_SIMD_FLOAT fiy0 = simd_zero();
+        MD_SIMD_FLOAT fiz0 = simd_zero();
+        MD_SIMD_FLOAT fix2 = simd_zero();
+        MD_SIMD_FLOAT fiy2 = simd_zero();
+        MD_SIMD_FLOAT fiz2 = simd_zero();
+
+        for(int k = 0; k < numneighs; k++) {
+            int cj = neighs[k];
+            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
+            unsigned int mask0, mask1, mask2, mask3;
+
+            MD_SIMD_FLOAT xj_tmp = simd_load_h_duplicate(&cj_x[CL_X_OFFSET]);
+            MD_SIMD_FLOAT yj_tmp = simd_load_h_duplicate(&cj_x[CL_Y_OFFSET]);
+            MD_SIMD_FLOAT zj_tmp = simd_load_h_duplicate(&cj_x[CL_Z_OFFSET]);
+            MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
+            MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
+            MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
+            MD_SIMD_FLOAT delx2 = simd_sub(xi2_tmp, xj_tmp);
+            MD_SIMD_FLOAT dely2 = simd_sub(yi2_tmp, yj_tmp);
+            MD_SIMD_FLOAT delz2 = simd_sub(zi2_tmp, zj_tmp);
+
+            #if CLUSTER_M == CLUSTER_N
+            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
+            mask0 = (unsigned int)(0xf - 0x1 * cond0);
+            mask1 = (unsigned int)(0xf - 0x2 * cond0);
+            mask2 = (unsigned int)(0xf - 0x4 * cond0);
+            mask3 = (unsigned int)(0xf - 0x8 * cond0);
+            #elif CLUSTER_M < CLUSTER_N
+            unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
+            unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
+            mask0 = (unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1);
+            mask1 = (unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1);
+            mask2 = (unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1);
+            mask3 = (unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1);
+            #else
+            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
+            unsigned int cond1 = (unsigned int)(cj == ci_cj1);
+            mask0 = (unsigned int)(0x3 - 0x1 * cond0);
+            mask1 = (unsigned int)(0x3 - 0x2 * cond0);
+            mask2 = (unsigned int)(0x3 - 0x1 * cond1);
+            mask3 = (unsigned int)(0x3 - 0x2 * cond1);
+            #endif
+
+            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((mask1 << half_mask_bits) | mask0);
+            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((mask3 << half_mask_bits) | mask2);
+
+            MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
+            MD_SIMD_FLOAT rsq2 = simd_fma(delx2, delx2, simd_fma(dely2, dely2, simd_mul(delz2, delz2)));
+
+            MD_SIMD_MASK cutoff_mask0 = simd_mask_and(excl_mask0, simd_mask_cond_lt(rsq0, cutforcesq_vec));
+            MD_SIMD_MASK cutoff_mask2 = simd_mask_and(excl_mask2, simd_mask_cond_lt(rsq2, cutforcesq_vec));
+
+            MD_SIMD_FLOAT sr2_0 = simd_reciprocal(rsq0);
+            MD_SIMD_FLOAT sr2_2 = simd_reciprocal(rsq2);
+
+            MD_SIMD_FLOAT sr6_0 = simd_mul(sr2_0, simd_mul(sr2_0, simd_mul(sr2_0, sigma6_vec)));
+            MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
+
+            MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
+            MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
+
+            fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
+            fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
+            fiz0 = simd_masked_add(fiz0, simd_mul(delz0, force0), cutoff_mask0);
+            fix2 = simd_masked_add(fix2, simd_mul(delx2, force2), cutoff_mask2);
+            fiy2 = simd_masked_add(fiy2, simd_mul(dely2, force2), cutoff_mask2);
+            fiz2 = simd_masked_add(fiz2, simd_mul(delz2, force2), cutoff_mask2);
+        }
+
+        simd_h_dual_reduce_sum(&ci_f[CL_X_OFFSET], fix0, fix2);
+        simd_h_dual_reduce_sum(&ci_f[CL_Y_OFFSET], fiy0, fiy2);
+        simd_h_dual_reduce_sum(&ci_f[CL_Z_OFFSET], fiz0, fiz2);
+
+        addStat(stats->calculated_forces, 1);
+        addStat(stats->num_neighs, numneighs);
+        addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
+    }
+
+    LIKWID_MARKER_STOP("force");
+    double E = getTimeStamp();
+    DEBUG_MESSAGE("computeForceLJ_2xnn end\n");
+    return E-S;
+}

 double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
     DEBUG_MESSAGE("computeForceLJ_4xn begin\n");
@@ -125,33 +267,36 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
     MD_FLOAT epsilon = param->epsilon;
     MD_SIMD_FLOAT cutforcesq_vec = simd_broadcast(cutforcesq);
     MD_SIMD_FLOAT sigma6_vec = simd_broadcast(sigma6);
-    MD_SIMD_FLOAT epsilon_vec = simd_broadcast(epsilon);
+    MD_SIMD_FLOAT eps_vec = simd_broadcast(epsilon);
     MD_SIMD_FLOAT c48_vec = simd_broadcast(48.0);
     MD_SIMD_FLOAT c05_vec = simd_broadcast(0.5);
-    const int unroll_j = CLUSTER_DIM_N / CLUSTER_DIM_M;
-
     double S = getTimeStamp();
     LIKWID_MARKER_START("force");

     #pragma omp parallel for
     for(int ci = 0; ci < atom->Nclusters_local; ci++) {
-        MD_FLOAT *ciptr = cluster_pos_ptr(ci);
-        MD_FLOAT *cifptr = cluster_force_ptr(ci);
+        int ci_cj0 = CJ0_FROM_CI(ci);
+        #if CLUSTER_M > CLUSTER_N
+        int ci_cj1 = CJ1_FROM_CI(ci);
+        #endif
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
         neighs = &neighbor->neighbors[ci * neighbor->maxneighs];
         int numneighs = neighbor->numneigh[ci];

-        MD_SIMD_FLOAT xi0_tmp = simd_broadcast(cluster_x(ciptr, 0));
-        MD_SIMD_FLOAT yi0_tmp = simd_broadcast(cluster_y(ciptr, 0));
-        MD_SIMD_FLOAT zi0_tmp = simd_broadcast(cluster_z(ciptr, 0));
-        MD_SIMD_FLOAT xi1_tmp = simd_broadcast(cluster_x(ciptr, 1));
-        MD_SIMD_FLOAT yi1_tmp = simd_broadcast(cluster_y(ciptr, 1));
-        MD_SIMD_FLOAT zi1_tmp = simd_broadcast(cluster_z(ciptr, 1));
-        MD_SIMD_FLOAT xi2_tmp = simd_broadcast(cluster_x(ciptr, 2));
-        MD_SIMD_FLOAT yi2_tmp = simd_broadcast(cluster_y(ciptr, 2));
-        MD_SIMD_FLOAT zi2_tmp = simd_broadcast(cluster_z(ciptr, 2));
-        MD_SIMD_FLOAT xi3_tmp = simd_broadcast(cluster_x(ciptr, 3));
-        MD_SIMD_FLOAT yi3_tmp = simd_broadcast(cluster_y(ciptr, 3));
-        MD_SIMD_FLOAT zi3_tmp = simd_broadcast(cluster_z(ciptr, 3));
+        MD_SIMD_FLOAT xi0_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 0]);
+        MD_SIMD_FLOAT xi1_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 1]);
+        MD_SIMD_FLOAT xi2_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 2]);
+        MD_SIMD_FLOAT xi3_tmp = simd_broadcast(ci_x[CL_X_OFFSET + 3]);
+        MD_SIMD_FLOAT yi0_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 0]);
+        MD_SIMD_FLOAT yi1_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 1]);
+        MD_SIMD_FLOAT yi2_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 2]);
+        MD_SIMD_FLOAT yi3_tmp = simd_broadcast(ci_x[CL_Y_OFFSET + 3]);
+        MD_SIMD_FLOAT zi0_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 0]);
+        MD_SIMD_FLOAT zi1_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 1]);
+        MD_SIMD_FLOAT zi2_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 2]);
+        MD_SIMD_FLOAT zi3_tmp = simd_broadcast(ci_x[CL_Z_OFFSET + 3]);
         MD_SIMD_FLOAT fix0 = simd_zero();
         MD_SIMD_FLOAT fiy0 = simd_zero();
         MD_SIMD_FLOAT fiz0 = simd_zero();
@@ -165,24 +310,13 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
         MD_SIMD_FLOAT fiy3 = simd_zero();
         MD_SIMD_FLOAT fiz3 = simd_zero();

-        for(int k = 0; k < numneighs; k += unroll_j) {
-            int cj0 = neighs[k + 0];
-            unsigned int cond0 = (unsigned int)(ci == cj0);
-            MD_FLOAT *cj0_ptr = cluster_pos_ptr(cj0);
-
-            #if VECTOR_WIDTH == 8
-            int cj1 = neighs[k + 1];
-            unsigned int cond1 = (unsigned int)(ci == cj1);
-            MD_FLOAT *cj1_ptr = cluster_pos_ptr(cj1);
-            MD_SIMD_FLOAT xj_tmp = simd_load2(cj0_ptr, cj1_ptr, 0);
-            MD_SIMD_FLOAT yj_tmp = simd_load2(cj0_ptr, cj1_ptr, 1);
-            MD_SIMD_FLOAT zj_tmp = simd_load2(cj0_ptr, cj1_ptr, 2);
-            #else
-            MD_SIMD_FLOAT xj_tmp = simd_load(cj0_ptr, 0);
-            MD_SIMD_FLOAT yj_tmp = simd_load(cj0_ptr, 1);
-            MD_SIMD_FLOAT zj_tmp = simd_load(cj0_ptr, 2);
-            #endif
-
+        for(int k = 0; k < numneighs; k++) {
+            int cj = neighs[k];
+            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
+            MD_SIMD_FLOAT xj_tmp = simd_load(&cj_x[CL_X_OFFSET]);
+            MD_SIMD_FLOAT yj_tmp = simd_load(&cj_x[CL_Y_OFFSET]);
+            MD_SIMD_FLOAT zj_tmp = simd_load(&cj_x[CL_Z_OFFSET]);
             MD_SIMD_FLOAT delx0 = simd_sub(xi0_tmp, xj_tmp);
             MD_SIMD_FLOAT dely0 = simd_sub(yi0_tmp, yj_tmp);
             MD_SIMD_FLOAT delz0 = simd_sub(zi0_tmp, zj_tmp);
@@ -196,16 +330,26 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
             MD_SIMD_FLOAT dely3 = simd_sub(yi3_tmp, yj_tmp);
             MD_SIMD_FLOAT delz3 = simd_sub(zi3_tmp, zj_tmp);

-            #if VECTOR_WIDTH == 8
+            #if CLUSTER_M == CLUSTER_N
+            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
+            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
+            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
+            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
+            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
+            #elif CLUSTER_M < CLUSTER_N
+            unsigned int cond0 = (unsigned int)((cj << 1) + 0 == ci);
+            unsigned int cond1 = (unsigned int)((cj << 1) + 1 == ci);
             MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xff - 0x1 * cond0 - 0x10 * cond1));
             MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xff - 0x2 * cond0 - 0x20 * cond1));
             MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xff - 0x4 * cond0 - 0x40 * cond1));
             MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xff - 0x8 * cond0 - 0x80 * cond1));
             #else
-            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0xf - 0x1 * cond0));
-            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0xf - 0x2 * cond0));
-            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0xf - 0x4 * cond0));
-            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0xf - 0x8 * cond0));
+            unsigned int cond0 = (unsigned int)(cj == ci_cj0);
+            unsigned int cond1 = (unsigned int)(cj == ci_cj1);
+            MD_SIMD_MASK excl_mask0 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond0));
+            MD_SIMD_MASK excl_mask1 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond0));
+            MD_SIMD_MASK excl_mask2 = simd_mask_from_u32((unsigned int)(0x3 - 0x1 * cond1));
+            MD_SIMD_MASK excl_mask3 = simd_mask_from_u32((unsigned int)(0x3 - 0x2 * cond1));
             #endif

             MD_SIMD_FLOAT rsq0 = simd_fma(delx0, delx0, simd_fma(dely0, dely0, simd_mul(delz0, delz0)));
@@ -228,10 +372,10 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
             MD_SIMD_FLOAT sr6_2 = simd_mul(sr2_2, simd_mul(sr2_2, simd_mul(sr2_2, sigma6_vec)));
             MD_SIMD_FLOAT sr6_3 = simd_mul(sr2_3, simd_mul(sr2_3, simd_mul(sr2_3, sigma6_vec)));

-            MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, epsilon_vec))));
-            MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, epsilon_vec))));
-            MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, epsilon_vec))));
-            MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, epsilon_vec))));
+            MD_SIMD_FLOAT force0 = simd_mul(c48_vec, simd_mul(sr6_0, simd_mul(simd_sub(sr6_0, c05_vec), simd_mul(sr2_0, eps_vec))));
+            MD_SIMD_FLOAT force1 = simd_mul(c48_vec, simd_mul(sr6_1, simd_mul(simd_sub(sr6_1, c05_vec), simd_mul(sr2_1, eps_vec))));
+            MD_SIMD_FLOAT force2 = simd_mul(c48_vec, simd_mul(sr6_2, simd_mul(simd_sub(sr6_2, c05_vec), simd_mul(sr2_2, eps_vec))));
+            MD_SIMD_FLOAT force3 = simd_mul(c48_vec, simd_mul(sr6_3, simd_mul(simd_sub(sr6_3, c05_vec), simd_mul(sr2_3, eps_vec))));

             fix0 = simd_masked_add(fix0, simd_mul(delx0, force0), cutoff_mask0);
             fiy0 = simd_masked_add(fiy0, simd_mul(dely0, force0), cutoff_mask0);
@@ -247,22 +391,22 @@ double computeForceLJ_4xn(Parameter *param, Atom *atom, Neighbor *neighbor, Stat
             fiz3 = simd_masked_add(fiz3, simd_mul(delz3, force3), cutoff_mask3);
         }

-        cluster_x(cifptr, 0) = simd_horizontal_sum(fix0);
-        cluster_y(cifptr, 0) = simd_horizontal_sum(fiy0);
-        cluster_z(cifptr, 0) = simd_horizontal_sum(fiz0);
-        cluster_x(cifptr, 1) = simd_horizontal_sum(fix1);
-        cluster_y(cifptr, 1) = simd_horizontal_sum(fiy1);
-        cluster_z(cifptr, 1) = simd_horizontal_sum(fiz1);
-        cluster_x(cifptr, 2) = simd_horizontal_sum(fix2);
-        cluster_y(cifptr, 2) = simd_horizontal_sum(fiy2);
-        cluster_z(cifptr, 2) = simd_horizontal_sum(fiz2);
-        cluster_x(cifptr, 3) = simd_horizontal_sum(fix3);
-        cluster_y(cifptr, 3) = simd_horizontal_sum(fiy3);
-        cluster_z(cifptr, 3) = simd_horizontal_sum(fiz3);
+        ci_f[CL_X_OFFSET + 0] = simd_h_reduce_sum(fix0);
+        ci_f[CL_X_OFFSET + 1] = simd_h_reduce_sum(fix1);
+        ci_f[CL_X_OFFSET + 2] = simd_h_reduce_sum(fix2);
+        ci_f[CL_X_OFFSET + 3] = simd_h_reduce_sum(fix3);
+        ci_f[CL_Y_OFFSET + 0] = simd_h_reduce_sum(fiy0);
+        ci_f[CL_Y_OFFSET + 1] = simd_h_reduce_sum(fiy1);
+        ci_f[CL_Y_OFFSET + 2] = simd_h_reduce_sum(fiy2);
+        ci_f[CL_Y_OFFSET + 3] = simd_h_reduce_sum(fiy3);
+        ci_f[CL_Z_OFFSET + 0] = simd_h_reduce_sum(fiz0);
+        ci_f[CL_Z_OFFSET + 1] = simd_h_reduce_sum(fiz1);
+        ci_f[CL_Z_OFFSET + 2] = simd_h_reduce_sum(fiz2);
+        ci_f[CL_Z_OFFSET + 3] = simd_h_reduce_sum(fiz3);

         addStat(stats->calculated_forces, 1);
         addStat(stats->num_neighs, numneighs);
-        addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_DIM_M / CLUSTER_DIM_N));
+        addStat(stats->force_iters, (long long int)((double)numneighs * CLUSTER_M / CLUSTER_N));
     }

     LIKWID_MARKER_STOP("force");
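[Reviewer note, not part of the patch: both SIMD kernels evaluate the same Lennard-Jones force factor as the reference kernel, only expressed as a chain of simd_mul/simd_sub/simd_fma. A scalar sketch of that chain, assuming sigma6 and epsilon as in the Parameter struct:

    // Per-pair factor: 48*eps * sr6 * (sr6 - 0.5) * sr2,
    // with sr2 = 1/rsq and sr6 = sigma6 * sr2^3; delx*force is then the x component.
    double lj_force(double rsq, double sigma6, double epsilon) {
        double sr2 = 1.0 / rsq;
        double sr6 = sr2 * sr2 * sr2 * sigma6;
        return 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
    }

The 2xnn variant evaluates this for two i-rows per vector (rows 0/1 in fix0, rows 2/3 in fix2); the 4xn variant uses one i-row per vector.]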
diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h
index 854d0b0..6590419 100644
--- a/gromacs/includes/atom.h
+++ b/gromacs/includes/atom.h
@@ -27,13 +27,71 @@

 #define DELTA 20000

-#define CLUSTER_DIM_M 4
-#define CLUSTER_DIM_N VECTOR_WIDTH
+#define CLUSTER_M 4
+
+// Nbnxn layouts (as of GROMACS):
+// Simd4xN:  M=4, N=VECTOR_WIDTH
+// Simd2xNN: M=4, N=(VECTOR_WIDTH/2)
+
+// Simd2xNN (here used for single-precision)
+#if VECTOR_WIDTH > CLUSTER_M * 2
+#   define KERNEL_NAME "Simd2xNN"
+#   define CLUSTER_N (VECTOR_WIDTH / 2)
+#   define computeForceLJ computeForceLJ_2xnn
+// Simd4xN
+#else
+#   define KERNEL_NAME "Simd4xN"
+#   define CLUSTER_N VECTOR_WIDTH
+#   define computeForceLJ computeForceLJ_4xn
+#endif
+
+#ifdef USE_REFERENCE_VERSION
+#   undef KERNEL_NAME
+#   undef computeForceLJ
+#   define KERNEL_NAME "Reference"
+#   define computeForceLJ computeForceLJ_ref
+#endif
+
+#if CLUSTER_M == CLUSTER_N
+#   define CJ0_FROM_CI(a)       (a)
+#   define CJ1_FROM_CI(a)       (a)
+#   define CI_BASE_INDEX(a,b)   ((a) * CLUSTER_N * (b))
+#   define CJ_BASE_INDEX(a,b)   ((a) * CLUSTER_N * (b))
+#elif CLUSTER_M == CLUSTER_N * 2 // M > N
+#   define CJ0_FROM_CI(a)       ((a) << 1)
+#   define CJ1_FROM_CI(a)       (((a) << 1) | 0x1)
+#   define CI_BASE_INDEX(a,b)   ((a) * CLUSTER_M * (b))
+#   define CJ_BASE_INDEX(a,b)   (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
+#elif CLUSTER_M == CLUSTER_N / 2 // M < N
+#   define CJ0_FROM_CI(a)       ((a) >> 1)
+#   define CJ1_FROM_CI(a)       ((a) >> 1)
+#   define CI_BASE_INDEX(a,b)   (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
+#   define CJ_BASE_INDEX(a,b)   ((a) * CLUSTER_N * (b))
+#else
+#   error "Invalid cluster configuration!"
+#endif
+
+#if CLUSTER_N != 2 && CLUSTER_N != 4 && CLUSTER_N != 8
+#   error "Cluster N dimension can only be 2, 4 or 8"
+#endif
+
+#define CI_SCALAR_BASE_INDEX(a) (CI_BASE_INDEX(a, 1))
+#define CI_VECTOR_BASE_INDEX(a) (CI_BASE_INDEX(a, 3))
+#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1))
+#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3))
+
+#if CLUSTER_M >= CLUSTER_N
+#   define CL_X_OFFSET          (0 * CLUSTER_M)
+#   define CL_Y_OFFSET          (1 * CLUSTER_M)
+#   define CL_Z_OFFSET          (2 * CLUSTER_M)
+#else
+#   define CL_X_OFFSET          (0 * CLUSTER_N)
+#   define CL_Y_OFFSET          (1 * CLUSTER_N)
+#   define CL_Z_OFFSET          (2 * CLUSTER_N)
+#endif

 typedef struct {
-    int bin;
     int natoms;
-    int type[CLUSTER_DIM_M];
     MD_FLOAT bbminx, bbmaxx;
     MD_FLOAT bbminy, bbmaxy;
     MD_FLOAT bbminz, bbmaxz;
@@ -44,9 +102,6 @@ typedef struct {
     int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
     MD_FLOAT *x, *y, *z;
     MD_FLOAT *vx, *vy, *vz;
-    MD_FLOAT *cl_x;
-    MD_FLOAT *cl_v;
-    MD_FLOAT *cl_f;
     int *border_map;
     int *type;
     int ntypes;
@@ -54,8 +109,15 @@ typedef struct {
     MD_FLOAT *sigma6;
     MD_FLOAT *cutforcesq;
     MD_FLOAT *cutneighsq;
-    Cluster *clusters;
     int *PBCx, *PBCy, *PBCz;
+    // Data in cluster format
+    MD_FLOAT *cl_x;
+    MD_FLOAT *cl_v;
+    MD_FLOAT *cl_f;
+    int *cl_type;
+    Cluster *iclusters, *jclusters;
+    int *icluster_bin;
+    int dummy_cj;
 } Atom;

 extern void initAtom(Atom*);
@@ -67,10 +129,6 @@ extern int readAtom_dmp(Atom*, Parameter*);
 extern void growAtom(Atom*);
 extern void growClusters(Atom*);

-#define cluster_pos_ptr(ci)         &(atom->cl_x[(ci) * CLUSTER_DIM_M * 3])
-#define cluster_velocity_ptr(ci)    &(atom->cl_v[(ci) * CLUSTER_DIM_M * 3])
-#define cluster_force_ptr(ci)       &(atom->cl_f[(ci) * CLUSTER_DIM_M * 3])
-
 #ifdef AOS
 #define POS_DATA_LAYOUT "AoS"
 #define atom_x(i) atom->x[(i) * 3 + 0]
@@ -83,14 +141,4 @@ extern void growClusters(Atom*);
 #define atom_z(i) atom->z[i]
 #endif

-#ifdef CLUSTER_AOS
-#define cluster_x(cptr, i) cptr[(i) * 3 + 0]
-#define cluster_y(cptr, i) cptr[(i) * 3 + 1]
-#define cluster_z(cptr, i) cptr[(i) * 3 + 2]
-#else
-#define cluster_x(cptr, i) cptr[0 * CLUSTER_DIM_M + (i)]
-#define cluster_y(cptr, i) cptr[1 * CLUSTER_DIM_M + (i)]
-#define cluster_z(cptr, i) cptr[2 * CLUSTER_DIM_M + (i)]
-#endif
-
 #endif
diff --git a/gromacs/includes/neighbor.h b/gromacs/includes/neighbor.h
index d1fbb5f..0ba00c0 100644
--- a/gromacs/includes/neighbor.h
+++ b/gromacs/includes/neighbor.h
@@ -40,6 +40,7 @@ extern void buildNeighbor(Atom*, Neighbor*);
 extern void pruneNeighbor(Parameter*, Atom*, Neighbor*);
 extern void sortAtom(Atom*);
 extern void buildClusters(Atom*);
+extern void defineJClusters(Atom*);
 extern void binClusters(Atom*);
 extern void updateSingleAtoms(Atom*);
 #endif
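[Reviewer note, not part of the patch: a worked example of the new index macros for the Simd2xNN single-precision case (M=4, N=8), where two i-clusters share one j-cluster's storage. The snippet below only re-states the M<N macro arithmetic from atom.h:

    #include <stdio.h>
    #define CLUSTER_N 8
    // M < N variant: i-cluster a lives in half of j-cluster (a >> 1)
    #define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
    #define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))

    int main(void) {
        // ci = 4 and ci = 5 both map into j-cluster 2 (= 5 >> 1):
        printf("CI base(4) = %d\n", CI_BASE_INDEX(4, 3)); // 48: lower-half lanes
        printf("CI base(5) = %d\n", CI_BASE_INDEX(5, 3)); // 52: upper-half lanes
        printf("CJ base(2) = %d\n", CJ_BASE_INDEX(2, 3)); // 48: same storage block
        return 0;
    }

Within a block, CL_X_OFFSET/CL_Y_OFFSET/CL_Z_OFFSET (0/8/16 here) select the coordinate row.]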
diff --git a/gromacs/includes/simd.h b/gromacs/includes/simd.h
index ad86e7b..59a044a 100644
--- a/gromacs/includes/simd.h
+++ b/gromacs/includes/simd.h
@@ -21,6 +21,7 @@
  * =======================================================================================
  */

+#include <stdlib.h>
 #include <immintrin.h>
 #include <zmmintrin.h>
 #include <stdio.h>
@@ -28,7 +29,9 @@
 #define SIMD_PRINT_REAL(a) simd_print_real(#a, a);
 #define SIMD_PRINT_MASK(a) simd_print_mask(#a, a);

-#if VECTOR_WIDTH == 8 // AVX512
+#ifdef AVX512
+
+#if PRECISION == 2 // Double precision

 #define MD_SIMD_FLOAT __m512d
 #define MD_SIMD_MASK __mmask8
@@ -45,30 +48,100 @@ static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { retur
 static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); }
 static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
 static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_pd(p); }

-static MD_SIMD_FLOAT simd_load2(MD_FLOAT *c0, MD_FLOAT *c1, int d) {
-    MD_SIMD_FLOAT x;
-#ifdef CLUSTER_AOS
-    __m256i aos_gather_vindex = _mm256_set_epi32(9, 6, 3, 0, 9, 6, 3, 0);
-    __m256i vindex = _mm256_add_epi32(aos_gather_vindex, _mm256_set1_epi32(d));
-    x = _mm512_mask_i32gather_pd(simd_zero(), simd_mask_from_u32(0x0f), vindex, c0, sizeof(double));
-    x = _mm512_mask_i32gather_pd(x, simd_mask_from_u32(0xf0), vindex, c1, sizeof(double));
-#else
-    x = _mm512_load_pd(&c0[d * CLUSTER_DIM_M]);
-    x = _mm512_insertf64x4(x, _mm256_load_pd(&c1[d * CLUSTER_DIM_M]), 1);
-#endif
-    return x;
-}
-
-static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
     MD_SIMD_FLOAT x = _mm512_add_pd(a, _mm512_shuffle_f64x2(a, a, 0xee));
     x = _mm512_add_pd(x, _mm512_shuffle_f64x2(x, x, 0x11));
     x = _mm512_add_pd(x, _mm512_permute_pd(x, 0x01));
     return *((MD_FLOAT *) &x);
 }

+static inline MD_SIMD_FLOAT simd_load_h_duplicate(const double* m) {
+    return _mm512_broadcast_f64x4(_mm256_load_pd(m));
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_dual(const double* m) {
+    return _mm512_insertf64x4(_mm512_broadcastsd_pd(_mm_load_sd(m)), _mm256_broadcastsd_pd(_mm_load_sd(m + 1)), 1);
+}
+
+static inline double simd_h_dual_reduce_sum(double* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
+    __m512d t0;
+    __m256d t2, t3;
+
+    t0 = _mm512_add_pd(v0, _mm512_permutex_pd(v0, 0x4e));
+    t0 = _mm512_mask_add_pd(t0, simd_mask_from_u32(0xccul), v1, _mm512_permutex_pd(v1, 0x4e));
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
+    t0 = _mm512_mask_shuffle_f64x2(t0, simd_mask_from_u32(0xaaul), t0, t0, 0xee);
+    t2 = _mm512_castpd512_pd256(t0);
+    t3 = _mm256_load_pd(m);
+    t3 = _mm256_add_pd(t3, t2);
+    _mm256_store_pd(m, t3);
+
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0x4e));
+    t0 = _mm512_add_pd(t0, _mm512_permutex_pd(t0, 0xb1));
+    return _mm_cvtsd_f64(_mm512_castpd512_pd128(t0));
+}
+
+#else // Single-precision
+
+#define MD_SIMD_FLOAT __m512
+#define MD_SIMD_MASK __mmask16
+
+static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm512_set1_ps(scalar); }
+static inline MD_SIMD_FLOAT simd_zero() { return _mm512_set1_ps(0.0f); }
+static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_add_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_sub_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_mul_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm512_fmadd_ps(a, b, c); }
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm512_rcp14_ps(a); }
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm512_mask_add_ps(a, m, a, b); }
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask16(a, b); }
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); }
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask16(a); }
+static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask16_u32(a); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm512_load_ps(p); }
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+    // This would only be called in a Mx16 configuration, which is not valid in GROMACS
+    fprintf(stderr, "simd_h_reduce_sum(): Called with AVX512 intrinsics and single-precision which is not valid!\n");
+    exit(-1);
+    return 0.0;
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_duplicate(const float* m) {
+    return _mm512_castpd_ps(_mm512_broadcast_f64x4(_mm256_load_pd((const double *)(m))));
+}
+
+static inline MD_SIMD_FLOAT simd_load_h_dual(const float* m) {
+    return _mm512_shuffle_f32x4(_mm512_broadcastss_ps(_mm_load_ss(m)), _mm512_broadcastss_ps(_mm_load_ss(m + 1)), 0x44);
+}
+
+static inline MD_FLOAT simd_h_dual_reduce_sum(float* m, MD_SIMD_FLOAT v0, MD_SIMD_FLOAT v1) {
+    __m512 t0, t1;
+    __m128 t2, t3;
+
+    t0 = _mm512_shuffle_f32x4(v0, v1, 0x88);
+    t1 = _mm512_shuffle_f32x4(v0, v1, 0xdd);
+    t0 = _mm512_add_ps(t0, t1);
+    t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0x4e));
+    t0 = _mm512_add_ps(t0, _mm512_permute_ps(t0, 0xb1));
+    t0 = _mm512_maskz_compress_ps(simd_mask_from_u32(0x1111ul), t0);
+    t3 = _mm512_castps512_ps128(t0);
+    t2 = _mm_load_ps(m);
+    t2 = _mm_add_ps(t2, t3);
+    _mm_store_ps(m, t2);
+
+    t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0x4e));
+    t3 = _mm_add_ps(t3, _mm_permute_ps(t3, 0xb1));
+    return _mm_cvtss_f32(t3);
+}
+
+#endif // PRECISION
+
 #else // AVX or AVX2

+#if PRECISION == 2 // Double precision
+
 #define MD_SIMD_FLOAT __m256d

 #ifdef NO_AVX2
@@ -82,8 +155,10 @@ static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_pd(0.0); }
 static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_pd(a, b); }
 static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_pd(a, b); }
 static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_pd(a, b); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_pd(p); }

 #ifdef NO_AVX2
+
 static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_cvtps_pd(_mm_rcp_ps(_mm256_cvtpd_ps(a))); }
 static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return simd_add(simd_mul(a, b), c); }
 static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return simd_add(a, _mm256_and_pd(b, m)); }
@@ -97,7 +172,7 @@ static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) {
 }
 // TODO: Implement this, althrough it is just required for debugging
 static inline int simd_mask_to_u32(MD_SIMD_MASK a) { return 0; }
-static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
     __m128d a0, a1;
     a = _mm256_add_pd(a, _mm256_permute_pd(a, 0b0101));
     a0 = _mm256_castpd256_pd128(a);
@@ -105,7 +180,9 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
     a0 = _mm_add_sd(a0, a1);
     return *((MD_FLOAT *) &a0);
 }
-#else
+
+#else // AVX2
+
 static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_pd(a); }
 static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_pd(a, b, c); }
 static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_pd(a, m, a, b); }
@@ -113,7 +190,7 @@ static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) {
 static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
 static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
 static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
-static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
     __m128d a0, a1;
     // test with shuffle & add as an alternative to hadd later
     a = _mm256_hadd_pd(a, a);
@@ -122,24 +199,52 @@ static inline MD_FLOAT simd_horizontal_sum(MD_SIMD_FLOAT a) {
     a0 = _mm_add_sd(a0, a1);
     return *((MD_FLOAT *) &a0);
 }
+
 #endif

-static MD_SIMD_FLOAT simd_load(MD_FLOAT *c0, int d) {
-    MD_SIMD_FLOAT x;
-#ifdef CLUSTER_AOS
-    #ifdef NO_AVX2
-    #error "Not possible to use AoS cluster layout without AVX2 support!"
-    #endif
-    __m128i aos_gather_vindex = _mm128_set_epi32(9, 6, 3, 0);
-    __m128i vindex = _mm128_add_epi32(aos_gather_vindex, _mm128_set1_epi32(d));
-    x = _mm256_i32gather_pd(c0, vindex, sizeof(double));
+#else // Single-precision
+
+#define MD_SIMD_FLOAT __m256
+
+#ifdef NO_AVX2
+#define MD_SIMD_MASK __m256
 #else
-    x = _mm256_load_pd(&c0[d * CLUSTER_DIM_M]);
+#define MD_SIMD_MASK __mmask8
 #endif
-    return x;
-}
-#endif
+
+static inline MD_SIMD_FLOAT simd_broadcast(float scalar) { return _mm256_set1_ps(scalar); }
+static inline MD_SIMD_FLOAT simd_zero() { return _mm256_set1_ps(0.0); }
+static inline MD_SIMD_FLOAT simd_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_add_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_sub(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_sub_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_mul(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_mul_ps(a, b); }
+static inline MD_SIMD_FLOAT simd_load(MD_FLOAT *p) { return _mm256_load_ps(p); }
+
+#ifdef NO_AVX2
+
+#error "AVX intrinsics with single-precision not implemented!"
+
+#else // AVX2
+
+static inline MD_SIMD_FLOAT simd_reciprocal(MD_SIMD_FLOAT a) { return _mm256_rcp14_ps(a); }
+static inline MD_SIMD_FLOAT simd_fma(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_FLOAT c) { return _mm256_fmadd_ps(a, b, c); }
+static inline MD_SIMD_FLOAT simd_masked_add(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b, MD_SIMD_MASK m) { return _mm256_mask_add_ps(a, m, a, b); }
+static inline MD_SIMD_MASK simd_mask_cond_lt(MD_SIMD_FLOAT a, MD_SIMD_FLOAT b) { return _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ); }
+static inline MD_SIMD_MASK simd_mask_and(MD_SIMD_MASK a, MD_SIMD_MASK b) { return _kand_mask8(a, b); }
+static inline MD_SIMD_MASK simd_mask_from_u32(unsigned int a) { return _cvtu32_mask8(a); }
+static inline unsigned int simd_mask_to_u32(MD_SIMD_MASK a) { return _cvtmask8_u32(a); }
+static inline MD_FLOAT simd_h_reduce_sum(MD_SIMD_FLOAT a) {
+    __m128 t0;
+    t0 = _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 0x1));
+    t0 = _mm_add_ps(t0, _mm_permute_ps(t0, _MM_SHUFFLE(1, 0, 3, 2)));
+    t0 = _mm_add_ss(t0, _mm_permute_ps(t0, _MM_SHUFFLE(0, 3, 2, 1)));
+    return *((MD_FLOAT *) &t0);
 }
+
+#endif // NO_AVX2
+
+#endif // PRECISION
+
+#endif // AVX or AVX2

 static inline void simd_print_real(const char *ref, MD_SIMD_FLOAT a) {
     double x[VECTOR_WIDTH];
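[Reviewer note, not part of the patch: the half-SIMD helpers drive the 2xNN lane layout. With single precision (VECTOR_WIDTH = 16, CLUSTER_N = 8), simd_load_h_duplicate() yields [xj0..xj7, xj0..xj7] and simd_load_h_dual(&xi[0]) yields [xi0 x8, xi1 x8], so one 16-wide FMA evaluates two i-rows against all eight j-atoms. A scalar model of that layout, with made-up values:

    #include <stdio.h>
    // Scalar model: lane k of a "dual" i-vector holds atom i0 for k < 8 and i1 for
    // k >= 8; lane k of a "duplicate" j-vector holds j atom (k % 8) in both halves.
    int main(void) {
        float xi[2] = {1.0f, 2.0f}, xj[8] = {0, 1, 2, 3, 4, 5, 6, 7};
        for(int k = 0; k < 16; k++) {
            float dual = xi[k / 8];   // simd_load_h_dual(&xi[0])
            float dup = xj[k % 8];    // simd_load_h_duplicate(xj)
            printf("lane %2d: i=%g j=%g delx=%g\n", k, dual, dup, dual - dup);
        }
        return 0;
    }

simd_h_dual_reduce_sum() then folds the two halves back into per-row force sums and accumulates them into memory.]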
diff --git a/gromacs/main-stub.c b/gromacs/main-stub.c
index 4da90db..c7596e7 100644
--- a/gromacs/main-stub.c
+++ b/gromacs/main-stub.c
@@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) {

     if(!csv) {
         printf("Number of timesteps: %d\n", param.ntimes);
-        printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
-        printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
         printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
         printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
         printf("Total number of atoms: %d\n", atom->Nlocal);
@@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) {
     E = getTimeStamp();
     double T_accum = E-S;
     double freq_hz = param.proc_freq * 1.e9;
-    const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
-    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
-    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
+    const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes);
+    const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz;
     const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);

     if(!csv) {
diff --git a/gromacs/main.c b/gromacs/main.c
index 0bc8a3c..a36af9d 100644
--- a/gromacs/main.c
+++ b/gromacs/main.c
@@ -43,14 +43,9 @@
 extern double computeForceLJ_ref(Parameter*, Atom*, Neighbor*, Stats*);
 extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
+extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
 extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);

-#ifdef USE_REFERENCE_VERSION
-#   define computeForceLJ computeForceLJ_ref
-#else
-#   define computeForceLJ computeForceLJ_4xn
-#endif
-
 double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
     if(param->force_field == FF_EAM) { initEam(eam, param); }
     double S, E;
@@ -75,6 +70,7 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
     setupThermo(param, atom->Natoms);
     if(param->input_file == NULL) { adjustThermo(param, atom); }
     buildClusters(atom);
+    defineJClusters(atom);
     setupPbc(atom, param);
     binClusters(atom);
     buildNeighbor(atom, neighbor);
@@ -89,6 +85,7 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
     updateSingleAtoms(atom);
     updateAtomsPbc(atom, param);
     buildClusters(atom);
+    defineJClusters(atom);
     setupPbc(atom, param);
     binClusters(atom);
     buildNeighbor(atom, neighbor);
@@ -101,17 +98,18 @@ void initialIntegrate(Parameter *param, Atom *atom) {
     DEBUG_MESSAGE("initialIntegrate start\n");

     for(int ci = 0; ci < atom->Nclusters_local; ci++) {
-        MD_FLOAT *ciptr = cluster_pos_ptr(ci);
-        MD_FLOAT *civptr = cluster_velocity_ptr(ci);
-        MD_FLOAT *cifptr = cluster_force_ptr(ci);
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
+        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];

-        for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
-            cluster_x(civptr, cii) += param->dtforce * cluster_x(cifptr, cii);
-            cluster_y(civptr, cii) += param->dtforce * cluster_y(cifptr, cii);
-            cluster_z(civptr, cii) += param->dtforce * cluster_z(cifptr, cii);
-            cluster_x(ciptr, cii) += param->dt * cluster_x(civptr, cii);
-            cluster_y(ciptr, cii) += param->dt * cluster_y(civptr, cii);
-            cluster_z(ciptr, cii) += param->dt * cluster_z(civptr, cii);
+        for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
+            ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
+            ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
+            ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
+            ci_x[CL_X_OFFSET + cii] += param->dt * ci_v[CL_X_OFFSET + cii];
+            ci_x[CL_Y_OFFSET + cii] += param->dt * ci_v[CL_Y_OFFSET + cii];
+            ci_x[CL_Z_OFFSET + cii] += param->dt * ci_v[CL_Z_OFFSET + cii];
         }
     }

@@ -122,13 +120,14 @@ void finalIntegrate(Parameter *param, Atom *atom) {
     DEBUG_MESSAGE("finalIntegrate start\n");

     for(int ci = 0; ci < atom->Nclusters_local; ci++) {
-        MD_FLOAT *civptr = cluster_velocity_ptr(ci);
-        MD_FLOAT *cifptr = cluster_force_ptr(ci);
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
+        MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];

-        for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
-            cluster_x(civptr, cii) += param->dtforce * cluster_x(cifptr, cii);
-            cluster_y(civptr, cii) += param->dtforce * cluster_y(cifptr, cii);
-            cluster_z(civptr, cii) += param->dtforce * cluster_z(cifptr, cii);
+        for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
+            ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
+            ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
+            ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
         }
     }

@@ -316,6 +315,7 @@ int main(int argc, char** argv) {
     }

     timer[TOTAL] = getTimeStamp() - timer[TOTAL];
+    updateSingleAtoms(&atom);
     computeThermo(-1, &param, &atom);

     if(param.xtc_file != NULL) {
@@ -323,6 +323,7 @@ int main(int argc, char** argv) {
     }

     printf(HLINE);
+    printf("Kernel: %s, MxN: %dx%d, Vector width: %d\n", KERNEL_NAME, CLUSTER_M, CLUSTER_N, VECTOR_WIDTH);
     printf("Data layout for positions: %s\n", POS_DATA_LAYOUT);
 #if PRECISION == 1
     printf("Using single precision floating point.\n");
diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c
index 3d86183..694abd7 100644
--- a/gromacs/neighbor.c
+++ b/gromacs/neighbor.c
@@ -39,8 +39,8 @@ static int nbinx, nbiny;
 static int mbinx, mbiny; // n bins in x, y
 static int *bincount;
 static int *bins;
-static int *cluster_bincount;
-static int *cluster_bins;
+static int *bin_nclusters;
+static int *bin_clusters;
 static int mbins; //total number of bins
 static int atoms_per_bin; // max atoms per bin
 static int clusters_per_bin; // max clusters per bin
@@ -63,12 +63,12 @@ void initNeighbor(Neighbor *neighbor, Parameter *param) {
     cutneigh = param->cutneigh;
     nmax = 0;
     atoms_per_bin = 8;
-    clusters_per_bin = (atoms_per_bin / CLUSTER_DIM_M) + 4;
+    clusters_per_bin = (atoms_per_bin / CLUSTER_M) + 10;
     stencil = NULL;
     bins = NULL;
     bincount = NULL;
-    cluster_bins = NULL;
-    cluster_bincount = NULL;
+    bin_clusters = NULL;
+    bin_nclusters = NULL;
     neighbor->maxneighs = 100;
     neighbor->numneigh = NULL;
     neighbor->neighbors = NULL;
@@ -91,7 +91,7 @@ void setupNeighbor(Parameter *param, Atom *atom) {
     MD_FLOAT zlo = 0.0;
     MD_FLOAT zhi = zprd;
     MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
-    MD_FLOAT atoms_in_cell = MAX(CLUSTER_DIM_M, CLUSTER_DIM_N);
+    MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N);
     MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density);
     MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density);
     nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex));
@@ -103,20 +103,16 @@ void setupNeighbor(Parameter *param, Atom *atom) {
     cutneighsq = cutneigh * cutneigh;

     coord = xlo - cutneigh - SMALL * xprd;
-    mbinxlo = (int) (coord * bininvx);
-    if (coord < 0.0) {
-        mbinxlo = mbinxlo - 1;
-    }
+    mbinxlo = (int)(coord * bininvx);
+    if(coord < 0.0) { mbinxlo = mbinxlo - 1; }
     coord = xhi + cutneigh + SMALL * xprd;
-    mbinxhi = (int) (coord * bininvx);
+    mbinxhi = (int)(coord * bininvx);
     coord = ylo - cutneigh - SMALL * yprd;
-    mbinylo = (int) (coord * bininvy);
-    if (coord < 0.0) {
-        mbinylo = mbinylo - 1;
-    }
+    mbinylo = (int)(coord * bininvy);
+    if(coord < 0.0) { mbinylo = mbinylo - 1; }
     coord = yhi + cutneigh + SMALL * yprd;
-    mbinyhi = (int) (coord * bininvy);
+    mbinyhi = (int)(coord * bininvy);

     mbinxlo = mbinxlo - 1;
     mbinxhi = mbinxhi + 1;
@@ -126,16 +122,12 @@ void setupNeighbor(Parameter *param, Atom *atom) {
     mbinyhi = mbinyhi + 1;
     mbiny = mbinyhi - mbinylo + 1;

-    nextx = (int) (cutneigh * bininvx);
+    nextx = (int)(cutneigh * bininvx);
+    nexty = (int)(cutneigh * bininvy);
     if(nextx * binsizex < FACTOR * cutneigh) nextx++;
-
-    nexty = (int) (cutneigh * bininvy);
     if(nexty * binsizey < FACTOR * cutneigh) nexty++;

-    if (stencil) {
-        free(stencil);
-    }
-
+    if (stencil) { free(stencil); }
     stencil = (int *) malloc((2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
     nstencil = 0;
@@ -147,19 +139,15 @@ void setupNeighbor(Parameter *param, Atom *atom) {
         }
     }

+    if(bincount) { free(bincount); }
+    if(bins) { free(bins); }
+    if(bin_nclusters) { free(bin_nclusters); }
+    if(bin_clusters) { free(bin_clusters); }
     mbins = mbinx * mbiny;
-
-    if (bincount) { free(bincount); }
     bincount = (int*) malloc(mbins * sizeof(int));
-
-    if (bins) { free(bins); }
     bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
-
-    if (cluster_bincount) { free(cluster_bincount); }
-    cluster_bincount = (int*) malloc(mbins * sizeof(int));
-
-    if (cluster_bins) { free(cluster_bins); }
-    cluster_bins = (int*) malloc(mbins * clusters_per_bin * sizeof(int));
+    bin_nclusters = (int*) malloc(mbins * sizeof(int));
+    bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int));

     /*
     DEBUG_MESSAGE("lo, hi = (%e, %e, %e), (%e, %e, %e)\n", xlo, ylo, zlo, xhi, yhi, zhi);
@@ -171,20 +159,20 @@ void setupNeighbor(Parameter *param, Atom *atom) {
 }

 MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
-    MD_FLOAT dl = atom->clusters[ci].bbminx - atom->clusters[cj].bbmaxx;
-    MD_FLOAT dh = atom->clusters[cj].bbminx - atom->clusters[ci].bbmaxx;
+    MD_FLOAT dl = atom->iclusters[ci].bbminx - atom->jclusters[cj].bbmaxx;
+    MD_FLOAT dh = atom->jclusters[cj].bbminx - atom->iclusters[ci].bbmaxx;
     MD_FLOAT dm = MAX(dl, dh);
     MD_FLOAT dm0 = MAX(dm, 0.0);
     MD_FLOAT d2 = dm0 * dm0;

-    dl = atom->clusters[ci].bbminy - atom->clusters[cj].bbmaxy;
-    dh = atom->clusters[cj].bbminy - atom->clusters[ci].bbmaxy;
+    dl = atom->iclusters[ci].bbminy - atom->jclusters[cj].bbmaxy;
+    dh = atom->jclusters[cj].bbminy - atom->iclusters[ci].bbmaxy;
     dm = MAX(dl, dh);
     dm0 = MAX(dm, 0.0);
     d2 += dm0 * dm0;

-    dl = atom->clusters[ci].bbminz - atom->clusters[cj].bbmaxz;
-    dh = atom->clusters[cj].bbminz - atom->clusters[ci].bbmaxz;
+    dl = atom->iclusters[ci].bbminz - atom->jclusters[cj].bbmaxz;
+    dh = atom->jclusters[cj].bbminz - atom->iclusters[ci].bbmaxz;
     dm = MAX(dl, dh);
     dm0 = MAX(dm, 0.0);
     d2 += dm0 * dm0;
@@ -192,14 +180,16 @@ MD_FLOAT getBoundingBoxDistanceSq(Atom *atom, int ci, int cj) {
 }

 int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
-    MD_FLOAT *ciptr = cluster_pos_ptr(ci);
-    MD_FLOAT *cjptr = cluster_pos_ptr(cj);
+    int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+    int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+    MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+    MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];

-    for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) {
-        for(int cjj = 0; cjj < atom->clusters[cj].natoms; cjj++) {
-            MD_FLOAT delx = cluster_x(ciptr, cii) - cluster_x(cjptr, cjj);
-            MD_FLOAT dely = cluster_y(ciptr, cii) - cluster_y(cjptr, cjj);
-            MD_FLOAT delz = cluster_z(ciptr, cii) - cluster_z(cjptr, cjj);
+    for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
+        for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
+            MD_FLOAT delx = ci_x[CL_X_OFFSET + cii] - cj_x[CL_X_OFFSET + cjj];
+            MD_FLOAT dely = ci_x[CL_Y_OFFSET + cii] - cj_x[CL_Y_OFFSET + cjj];
+            MD_FLOAT delz = ci_x[CL_Z_OFFSET + cii] - cj_x[CL_Z_OFFSET + cjj];
             if(delx * delx + dely * dely + delz * delz < rsq) {
                 return 1;
             }
@@ -211,11 +201,10 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
 void buildNeighbor(Atom *atom, Neighbor *neighbor) {
     DEBUG_MESSAGE("buildNeighbor start\n");
-    int nall = atom->Nclusters_local + atom->Nclusters_ghost;

     /* extend atom arrays if necessary */
-    if(nall > nmax) {
-        nmax = nall;
+    if(atom->Nclusters_local > nmax) {
+        nmax = atom->Nclusters_local;
         if(neighbor->numneigh) free(neighbor->numneigh);
         if(neighbor->neighbors) free(neighbor->neighbors);
         neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
@@ -234,22 +223,22 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
         resize = 0;

         for(int ci = 0; ci < atom->Nclusters_local; ci++) {
-            int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
+            int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
             int n = 0;
-            int ibin = atom->clusters[ci].bin;
-            MD_FLOAT ibb_xmin = atom->clusters[ci].bbminx;
-            MD_FLOAT ibb_xmax = atom->clusters[ci].bbmaxx;
-            MD_FLOAT ibb_ymin = atom->clusters[ci].bbminy;
-            MD_FLOAT ibb_ymax = atom->clusters[ci].bbmaxy;
-            MD_FLOAT ibb_zmin = atom->clusters[ci].bbminz;
-            MD_FLOAT ibb_zmax = atom->clusters[ci].bbmaxz;
+            int ibin = atom->icluster_bin[ci];
+            MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
+            MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
+            MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy;
+            MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy;
+            MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz;
+            MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz;

             for(int k = 0; k < nstencil; k++) {
                 int jbin = ibin + stencil[k];
-                int *loc_bin = &cluster_bins[jbin * clusters_per_bin];
+                int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
                 int cj, m = -1;
                 MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
-                const int c = cluster_bincount[jbin];
+                const int c = bin_nclusters[jbin];

                 if(c > 0) {
                     MD_FLOAT dl, dh, dm, dm0, d_bb_sq;
@@ -257,8 +246,8 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
                     do {
                         m++;
                         cj = loc_bin[m];
-                        jbb_zmin = atom->clusters[cj].bbminz;
-                        jbb_zmax = atom->clusters[cj].bbmaxz;
+                        jbb_zmin = atom->jclusters[cj].bbminz;
+                        jbb_zmax = atom->jclusters[cj].bbmaxz;
                         dl = ibb_zmin - jbb_zmax;
                         dh = jbb_zmin - ibb_zmax;
                         dm = MAX(dl, dh);
@@ -266,10 +255,10 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
                         d_bb_sq = dm0 * dm0;
                     } while(m + 1 < c && d_bb_sq > cutneighsq);

-                    jbb_xmin = atom->clusters[cj].bbminx;
-                    jbb_xmax = atom->clusters[cj].bbmaxx;
-                    jbb_ymin = atom->clusters[cj].bbminy;
-                    jbb_ymax = atom->clusters[cj].bbmaxy;
+                    jbb_xmin = atom->jclusters[cj].bbminx;
+                    jbb_xmax = atom->jclusters[cj].bbmaxx;
+                    jbb_ymin = atom->jclusters[cj].bbminy;
+                    jbb_ymax = atom->jclusters[cj].bbmaxy;

                     while(m < c) {
                         dl = ibb_zmin - jbb_zmax;
@@ -303,20 +292,21 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
                         m++;
                         if(m < c) {
                             cj = loc_bin[m];
-                            jbb_xmin = atom->clusters[cj].bbminx;
-                            jbb_xmax = atom->clusters[cj].bbmaxx;
-                            jbb_ymin = atom->clusters[cj].bbminy;
-                            jbb_ymax = atom->clusters[cj].bbmaxy;
-                            jbb_zmin = atom->clusters[cj].bbminz;
-                            jbb_zmax = atom->clusters[cj].bbmaxz;
+                            jbb_xmin = atom->jclusters[cj].bbminx;
+                            jbb_xmax = atom->jclusters[cj].bbmaxx;
+                            jbb_ymin = atom->jclusters[cj].bbminy;
+                            jbb_ymax = atom->jclusters[cj].bbmaxy;
+                            jbb_zmin = atom->jclusters[cj].bbminz;
+                            jbb_zmax = atom->jclusters[cj].bbmaxz;
                         }
                     }
                 }
             }

-        if(CLUSTER_DIM_N > CLUSTER_DIM_M) {
-            while(n % (CLUSTER_DIM_N / CLUSTER_DIM_M)) {
-                neighptr[n++] = nall - 1; // Last cluster is always a dummy cluster
+        // Fill neighbor list with dummy values to fit vector width
+        if(CLUSTER_N < VECTOR_WIDTH) {
+            while(n % (VECTOR_WIDTH / CLUSTER_N)) {
+                neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
             }
         }
@@ -341,38 +331,40 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
     /*
     DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq);
     for(int ci = 0; ci < 6; ci++) {
-        MD_FLOAT *ciptr = cluster_pos_ptr(ci);
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
+        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
         int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);

         DEBUG_MESSAGE("Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n", ci,
-            atom->clusters[ci].bbminx,
-            atom->clusters[ci].bbmaxx,
-            atom->clusters[ci].bbminy,
-            atom->clusters[ci].bbmaxy,
-            atom->clusters[ci].bbminz,
-            atom->clusters[ci].bbmaxz);
+            atom->iclusters[ci].bbminx,
+            atom->iclusters[ci].bbmaxx,
+            atom->iclusters[ci].bbminy,
+            atom->iclusters[ci].bbmaxy,
+            atom->iclusters[ci].bbminz,
+            atom->iclusters[ci].bbmaxz);

-        for(int cii = 0; cii < CLUSTER_DIM_M; cii++) {
-            DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(ciptr, cii), cluster_y(ciptr, cii), cluster_z(ciptr, cii));
+        for(int cii = 0; cii < CLUSTER_M; cii++) {
+            DEBUG_MESSAGE("%f, %f, %f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
         }

         DEBUG_MESSAGE("Neighbors:\n");
         for(int k = 0; k < neighbor->numneigh[ci]; k++) {
-            const int cj = neighptr[k];
-            MD_FLOAT *cjptr = cluster_pos_ptr(cj);
+            int cj = neighptr[k];
+            int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+            MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];

             DEBUG_MESSAGE("    Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n", cj,
-                atom->clusters[cj].bbminx,
-                atom->clusters[cj].bbmaxx,
-                atom->clusters[cj].bbminy,
-                atom->clusters[cj].bbmaxy,
-                atom->clusters[cj].bbminz,
-                atom->clusters[cj].bbmaxz);
+                atom->jclusters[cj].bbminx,
+                atom->jclusters[cj].bbmaxx,
+                atom->jclusters[cj].bbminy,
+                atom->jclusters[cj].bbmaxy,
+                atom->jclusters[cj].bbminz,
+                atom->jclusters[cj].bbmaxz);

-            for(int cjj = 0; cjj < CLUSTER_DIM_M; cjj++) {
-                DEBUG_MESSAGE("    %f, %f, %f\n", cluster_x(cjptr, cjj), cluster_y(cjptr, cjj), cluster_z(cjptr, cjj));
+            for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
+                DEBUG_MESSAGE("    %f, %f, %f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]);
             }
         }
     }
@@ -383,7 +375,6 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {

 void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
     DEBUG_MESSAGE("pruneNeighbor start\n");
-    int nall = atom->Nclusters_local + atom->Nclusters_ghost;
     //MD_FLOAT cutsq = param->cutforce * param->cutforce;
     MD_FLOAT cutsq = cutneighsq;

@@ -393,8 +384,8 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
         int k = 0;

         // Remove dummy clusters if necessary
-        if(CLUSTER_DIM_N > CLUSTER_DIM_M) {
-            while(neighs[numneighs - 1] == nall - 1) {
+        if(CLUSTER_N < VECTOR_WIDTH) {
+            while(neighs[numneighs - 1] == atom->dummy_cj) {
                 numneighs--;
             }
         }
@@ -410,9 +401,9 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
         }

         // Readd dummy clusters if necessary
-        if(CLUSTER_DIM_N > CLUSTER_DIM_M) {
-            while(numneighs % (CLUSTER_DIM_N / CLUSTER_DIM_M)) {
-                neighs[numneighs++] = nall - 1; // Last cluster is always a dummy cluster
+        if(CLUSTER_N < VECTOR_WIDTH) {
+            while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
+                neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
             }
         }
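[Reviewer note, not part of the patch: the dummy padding above is safe for the SIMD kernels because the dummy j-cluster's coordinates are INFINITY (see the else branch in buildClusters below), so every distance test against it fails and the masked add contributes nothing. A tiny scalar check of that invariant:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        double xj = INFINITY, xi = 1.0, cutforcesq = 2.5;
        double delx = xi - xj;   // -inf
        double rsq = delx * delx; // +inf
        printf("in range: %d\n", rsq < cutforcesq); // 0 -> force lane masked out
        return 0;
    }

The padding rounds each list up to a multiple of VECTOR_WIDTH / CLUSTER_N so the vector loop never reads past the end.]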
cii] = atom->vz[i]; // TODO: To create the bounding boxes faster, we can use SIMD operations if(bbminx > xtmp) { bbminx = xtmp; } @@ -595,24 +589,24 @@ void buildClusters(Atom *atom) { if(bbminz > ztmp) { bbminz = ztmp; } if(bbmaxz < ztmp) { bbmaxz = ztmp; } - atom->clusters[ci].type[cii] = atom->type[i]; - atom->clusters[ci].natoms++; + ci_type[cii] = atom->type[i]; + atom->iclusters[ci].natoms++; } else { - cluster_x(cptr, cii) = INFINITY; - cluster_y(cptr, cii) = INFINITY; - cluster_z(cptr, cii) = INFINITY; + ci_x[CL_X_OFFSET + cii] = INFINITY; + ci_x[CL_Y_OFFSET + cii] = INFINITY; + ci_x[CL_Z_OFFSET + cii] = INFINITY; } ac++; } - atom->clusters[ci].bin = bin; - atom->clusters[ci].bbminx = bbminx; - atom->clusters[ci].bbmaxx = bbmaxx; - atom->clusters[ci].bbminy = bbminy; - atom->clusters[ci].bbmaxy = bbmaxy; - atom->clusters[ci].bbminz = bbminz; - atom->clusters[ci].bbmaxz = bbmaxz; + atom->icluster_bin[ci] = bin; + atom->iclusters[ci].bbminx = bbminx; + atom->iclusters[ci].bbmaxx = bbmaxx; + atom->iclusters[ci].bbminy = bbminy; + atom->iclusters[ci].bbmaxy = bbmaxy; + atom->iclusters[ci].bbminz = bbminz; + atom->iclusters[ci].bbmaxz = bbmaxz; atom->Nclusters_local++; } } @@ -620,6 +614,94 @@ void buildClusters(Atom *atom) { DEBUG_MESSAGE("buildClusters end\n"); } +void defineJClusters(Atom *atom) { + DEBUG_MESSAGE("defineJClusters start\n"); + + for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int cj0 = CJ0_FROM_CI(ci); + + if(CLUSTER_M == CLUSTER_N) { + atom->jclusters[cj0].bbminx = atom->iclusters[ci].bbminx; + atom->jclusters[cj0].bbmaxx = atom->iclusters[ci].bbmaxx; + atom->jclusters[cj0].bbminy = atom->iclusters[ci].bbminy; + atom->jclusters[cj0].bbmaxy = atom->iclusters[ci].bbmaxy; + atom->jclusters[cj0].bbminz = atom->iclusters[ci].bbminz; + atom->jclusters[cj0].bbmaxz = atom->iclusters[ci].bbmaxz; + atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms; + + } else if(CLUSTER_M > CLUSTER_N) { + int cj1 = CJ1_FROM_CI(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY; + MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY; + MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY; + + for(int cii = 0; cii < MIN(atom->iclusters[ci].natoms, CLUSTER_N); cii++) { + MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii]; + MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii]; + MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii]; + + // TODO: To create the bounding boxes faster, we can use SIMD operations + if(bbminx > xtmp) { bbminx = xtmp; } + if(bbmaxx < xtmp) { bbmaxx = xtmp; } + if(bbminy > ytmp) { bbminy = ytmp; } + if(bbmaxy < ytmp) { bbmaxy = ytmp; } + if(bbminz > ztmp) { bbminz = ztmp; } + if(bbmaxz < ztmp) { bbmaxz = ztmp; } + } + + atom->jclusters[cj0].bbminx = bbminx; + atom->jclusters[cj0].bbmaxx = bbmaxx; + atom->jclusters[cj0].bbminy = bbminy; + atom->jclusters[cj0].bbmaxy = bbmaxy; + atom->jclusters[cj0].bbminz = bbminz; + atom->jclusters[cj0].bbmaxz = bbmaxz; + atom->jclusters[cj0].natoms = MIN(atom->iclusters[ci].natoms, CLUSTER_N); + + bbminx = INFINITY, bbmaxx = -INFINITY; + bbminy = INFINITY, bbmaxy = -INFINITY; + bbminz = INFINITY, bbmaxz = -INFINITY; + + for(int cii = CLUSTER_N; cii < atom->iclusters[ci].natoms; cii++) { + MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii]; + MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii]; + MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii]; + + // TODO: To create the bounding boxes faster, we can use SIMD operations + if(bbminx > xtmp) { bbminx = xtmp; } + if(bbmaxx < xtmp) { bbmaxx = xtmp; } + if(bbminy > 
ytmp) { bbminy = ytmp; } + if(bbmaxy < ytmp) { bbmaxy = ytmp; } + if(bbminz > ztmp) { bbminz = ztmp; } + if(bbmaxz < ztmp) { bbmaxz = ztmp; } + } + + atom->jclusters[cj1].bbminx = bbminx; + atom->jclusters[cj1].bbmaxx = bbmaxx; + atom->jclusters[cj1].bbminy = bbminy; + atom->jclusters[cj1].bbmaxy = bbmaxy; + atom->jclusters[cj1].bbminz = bbminz; + atom->jclusters[cj1].bbmaxz = bbmaxz; + atom->jclusters[cj1].natoms = MAX(0, atom->iclusters[ci].natoms - CLUSTER_N); + + } else { + if(ci % 2 == 0) { + const int ci1 = ci + 1; + atom->jclusters[cj0].bbminx = MIN(atom->iclusters[ci].bbminx, atom->iclusters[ci1].bbminx); + atom->jclusters[cj0].bbmaxx = MAX(atom->iclusters[ci].bbmaxx, atom->iclusters[ci1].bbmaxx); + atom->jclusters[cj0].bbminy = MIN(atom->iclusters[ci].bbminy, atom->iclusters[ci1].bbminy); + atom->jclusters[cj0].bbmaxy = MAX(atom->iclusters[ci].bbmaxy, atom->iclusters[ci1].bbmaxy); + atom->jclusters[cj0].bbminz = MIN(atom->iclusters[ci].bbminz, atom->iclusters[ci1].bbminz); + atom->jclusters[cj0].bbmaxz = MAX(atom->iclusters[ci].bbmaxz, atom->iclusters[ci1].bbmaxz); + atom->jclusters[cj0].natoms = atom->iclusters[ci].natoms + atom->iclusters[ci1].natoms; + } + } + } + + DEBUG_MESSAGE("defineJClusters end\n"); +} + void binClusters(Atom *atom) { DEBUG_MESSAGE("binClusters start\n"); @@ -629,7 +711,7 @@ void binClusters(Atom *atom) { MD_FLOAT *cptr = cluster_pos_ptr(ci); DEBUG_MESSAGE("Cluster %d:\n", ci); DEBUG_MESSAGE("bin=%d, Natoms=%d, bbox={%f,%f},{%f,%f},{%f,%f}\n", - atom->clusters[ci].bin, + atom->icluster_bin[ci], atom->clusters[ci].natoms, atom->clusters[ci].bbminx, atom->clusters[ci].bbmaxx, @@ -638,104 +720,120 @@ ... atom->clusters[ci].bbminz, atom->clusters[ci].bbmaxz); - for(int cii = 0; cii < CLUSTER_DIM_M; cii++) { + for(int cii = 0; cii < CLUSTER_M; cii++) { DEBUG_MESSAGE("%f, %f, %f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii)); } } */ const int nlocal = atom->Nclusters_local; - int resize = 1; + const int jfac = MAX(1, CLUSTER_N / CLUSTER_M); + const int ncj = atom->Nclusters_local / jfac; + int resize = 1; while(resize > 0) { resize = 0; for(int bin = 0; bin < mbins; bin++) { - cluster_bincount[bin] = 0; + bin_nclusters[bin] = 0; } for(int ci = 0; ci < nlocal && !resize; ci++) { - int bin = atom->clusters[ci].bin; - int c = cluster_bincount[bin]; - if(c < clusters_per_bin) { - cluster_bins[bin * clusters_per_bin + c] = ci; - cluster_bincount[bin]++; - } else { - resize = 1; + // Ensure this j-cluster is added to the bin only once + if(!(CLUSTER_M < CLUSTER_N && ci % 2)) { + int bin = atom->icluster_bin[ci]; + int c = bin_nclusters[bin]; + if(c + 1 < clusters_per_bin) { + bin_clusters[bin * clusters_per_bin + c] = CJ0_FROM_CI(ci); + bin_nclusters[bin]++; + + if(CLUSTER_M > CLUSTER_N) { + int cj1 = CJ1_FROM_CI(ci); + if(atom->jclusters[cj1].natoms > 0) { + bin_clusters[bin * clusters_per_bin + c + 1] = cj1; + bin_nclusters[bin]++; + } + } + } else { + resize = 1; + } } } for(int cg = 0; cg < atom->Nclusters_ghost && !resize; cg++) { - const int ci = nlocal + cg; - MD_FLOAT *cptr = cluster_pos_ptr(ci); - MD_FLOAT ci_minz = atom->clusters[ci].bbminz; - MD_FLOAT xtmp, ytmp; + const int cj = ncj + cg; int ix = -1, iy = -1; + MD_FLOAT xtmp, ytmp; - xtmp = cluster_x(cptr, 0); - ytmp = cluster_y(cptr, 0); - coord2bin2D(xtmp, ytmp, &ix, &iy); - ix = MAX(MIN(ix, mbinx - 1), 0); - iy = MAX(MIN(iy, mbiny - 1), 0); - for(int cii = 1; cii < atom->clusters[ci].natoms; cii++) { - int nix, niy; - xtmp = cluster_x(cptr, cii); - ytmp 
= cluster_y(cptr, cii); - coord2bin2D(xtmp, ytmp, &nix, &niy); - nix = MAX(MIN(nix, mbinx - 1), 0); - niy = MAX(MIN(niy, mbiny - 1), 0); + if(atom->jclusters[cj].natoms > 0) { + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_FLOAT cj_minz = atom->jclusters[cj].bbminz; - // Always put the cluster on the bin of its innermost atom so - // the cluster should be closer to local clusters - if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; } - if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; } - if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; } - if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; } - } + xtmp = cj_x[CL_X_OFFSET + 0]; + ytmp = cj_x[CL_Y_OFFSET + 0]; + coord2bin2D(xtmp, ytmp, &ix, &iy); + ix = MAX(MIN(ix, mbinx - 1), 0); + iy = MAX(MIN(iy, mbiny - 1), 0); + for(int cjj = 1; cjj < atom->jclusters[cj].natoms; cjj++) { + int nix, niy; + xtmp = cj_x[CL_X_OFFSET + cjj]; + ytmp = cj_x[CL_Y_OFFSET + cjj]; + coord2bin2D(xtmp, ytmp, &nix, &niy); + nix = MAX(MIN(nix, mbinx - 1), 0); + niy = MAX(MIN(niy, mbiny - 1), 0); - int bin = iy * mbinx + ix + 1; - int c = cluster_bincount[bin]; - if(c < clusters_per_bin) { - // Insert the current ghost cluster in the bin keeping clusters - // sorted by z coordinate - int inserted = 0; - for(int i = 0; i < c; i++) { - int last_cl = cluster_bins[bin * clusters_per_bin + i]; - if(atom->clusters[last_cl].bbminz > ci_minz) { - cluster_bins[bin * clusters_per_bin + i] = ci; + // Always put the cluster in the bin of its innermost atom so + // that the cluster ends up closer to the local clusters + if(atom->PBCx[cg] > 0 && ix > nix) { ix = nix; } + if(atom->PBCx[cg] < 0 && ix < nix) { ix = nix; } + if(atom->PBCy[cg] > 0 && iy > niy) { iy = niy; } + if(atom->PBCy[cg] < 0 && iy < niy) { iy = niy; } + } - for(int j = i + 1; j <= c; j++) { - int tmp = cluster_bins[bin * clusters_per_bin + j]; - cluster_bins[bin * clusters_per_bin + j] = last_cl; - last_cl = tmp; + int bin = iy * mbinx + ix + 1; + int c = bin_nclusters[bin]; + if(c < clusters_per_bin) { + // Insert the current ghost cluster in the bin keeping clusters + // sorted by z coordinate + int inserted = 0; + for(int i = 0; i < c; i++) { + int last_cl = bin_clusters[bin * clusters_per_bin + i]; + if(atom->jclusters[last_cl].bbminz > cj_minz) { + bin_clusters[bin * clusters_per_bin + i] = cj; + + for(int j = i + 1; j <= c; j++) { + int tmp = bin_clusters[bin * clusters_per_bin + j]; + bin_clusters[bin * clusters_per_bin + j] = last_cl; + last_cl = tmp; + } + + inserted = 1; + break; } - - inserted = 1; - break; } - } - if(!inserted) { - cluster_bins[bin * clusters_per_bin + c] = ci; - } + if(!inserted) { + bin_clusters[bin * clusters_per_bin + c] = cj; + } - atom->clusters[ci].bin = bin; - cluster_bincount[bin]++; - } else { - resize = 1; + bin_nclusters[bin]++; + } else { + resize = 1; + } } } if(resize) { - free(cluster_bins); + free(bin_clusters); clusters_per_bin *= 2; - cluster_bins = (int*) malloc(mbins * clusters_per_bin * sizeof(int)); + bin_clusters = (int*) malloc(mbins * clusters_per_bin * sizeof(int)); } } /* - DEBUG_MESSAGE("cluster_bincount\n"); - for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", cluster_bincount[i]); } + DEBUG_MESSAGE("bin_nclusters\n"); + for(int i = 0; i < mbins; i++) { DEBUG_MESSAGE("%d, ", bin_nclusters[i]); } DEBUG_MESSAGE("\n"); */ @@ -747,16 +845,17 @@ void updateSingleAtoms(Atom *atom) { int Natom = 0; for(int ci = 0; ci < atom->Nclusters_local; ci++) {
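+        // Editor's note: updateSingleAtoms scatters the cluster-blocked positions
+        // and velocities back into the plain per-atom arrays (presumably so that
+        // re-binning and trajectory output can keep working on linear indices).
-        MD_FLOAT *cptr = cluster_pos_ptr(ci); -        MD_FLOAT *cvptr = 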
cluster_velocity_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - atom_x(Natom) = cluster_x(cptr, cii); - atom_y(Natom) = cluster_y(cptr, cii); - atom_z(Natom) = cluster_z(cptr, cii); - atom->vx[Natom] = cluster_x(cvptr, cii); - atom->vy[Natom] = cluster_y(cvptr, cii); - atom->vz[Natom] = cluster_z(cvptr, cii); + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { + atom_x(Natom) = ci_x[CL_X_OFFSET + cii]; + atom_y(Natom) = ci_x[CL_Y_OFFSET + cii]; + atom_z(Natom) = ci_x[CL_Z_OFFSET + cii]; + atom->vx[Natom] = ci_v[CL_X_OFFSET + cii]; + atom->vy[Natom] = ci_v[CL_Y_OFFSET + cii]; + atom->vz[Natom] = ci_v[CL_Z_OFFSET + cii]; Natom++; } } diff --git a/gromacs/pbc.c b/gromacs/pbc.c index acb203e..301ee28 100644 --- a/gromacs/pbc.c +++ b/gromacs/pbc.c @@ -28,6 +28,7 @@ #include #include #include +#include #define DELTA 20000 @@ -45,28 +46,31 @@ void initPbc(Atom* atom) { /* update coordinates of ghost atoms */ /* uses mapping created in setupPbc */ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { - int *border_map = atom->border_map; - int nlocal = atom->Nclusters_local; + DEBUG_MESSAGE("updatePbc start\n"); + int jfac = MAX(1, CLUSTER_N / CLUSTER_M); + int ncj = atom->Nclusters_local / jfac; MD_FLOAT xprd = param->xprd; MD_FLOAT yprd = param->yprd; MD_FLOAT zprd = param->zprd; for(int cg = 0; cg < atom->Nclusters_ghost; cg++) { - const int ci = nlocal + cg; - MD_FLOAT *cptr = cluster_pos_ptr(ci); - MD_FLOAT *bmap_cptr = cluster_pos_ptr(border_map[cg]); + const int cj = ncj + cg; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base]; MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY; MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY; MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY; - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { - MD_FLOAT xtmp = cluster_x(bmap_cptr, cii) + atom->PBCx[cg] * xprd; - MD_FLOAT ytmp = cluster_y(bmap_cptr, cii) + atom->PBCy[cg] * yprd; - MD_FLOAT ztmp = cluster_z(bmap_cptr, cii) + atom->PBCz[cg] * zprd; + for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) { + MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd; + MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd; + MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd; - cluster_x(cptr, cii) = xtmp; - cluster_y(cptr, cii) = ytmp; - cluster_z(cptr, cii) = ztmp; + cj_x[CL_X_OFFSET + cjj] = xtmp; + cj_x[CL_Y_OFFSET + cjj] = ytmp; + cj_x[CL_Z_OFFSET + cjj] = ztmp; if(firstUpdate) { // TODO: To create the bounding boxes faster, we can use SIMD operations @@ -80,20 +84,22 @@ void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { } if(firstUpdate) { - for(int cii = atom->clusters[ci].natoms; cii < CLUSTER_DIM_M; cii++) { - cluster_x(cptr, cii) = INFINITY; - cluster_y(cptr, cii) = INFINITY; - cluster_z(cptr, cii) = INFINITY; + for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) { + cj_x[CL_X_OFFSET + cjj] = INFINITY; + cj_x[CL_Y_OFFSET + cjj] = INFINITY; + cj_x[CL_Z_OFFSET + cjj] = INFINITY; } - atom->clusters[ci].bbminx = bbminx; - atom->clusters[ci].bbmaxx = bbmaxx; - atom->clusters[ci].bbminy = bbminy; - atom->clusters[ci].bbmaxy = bbmaxy; - atom->clusters[ci].bbminz = bbminz; - atom->clusters[ci].bbmaxz = bbmaxz; + 
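// NOTE: ghost j-cluster bounding boxes are recomputed only when firstUpdate is
+ // set, i.e. right after setupPbc has created the ghost clusters; subsequent
+ // calls to updatePbc only translate the coordinates.
+ 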
atom->jclusters[cj].bbminx = bbminx; + atom->jclusters[cj].bbmaxx = bbmaxx; + atom->jclusters[cj].bbminy = bbminy; + atom->jclusters[cj].bbmaxy = bbmaxy; + atom->jclusters[cj].bbminz = bbminz; + atom->jclusters[cj].bbmaxz = bbmaxz; } } + + DEBUG_MESSAGE("updatePbc end\n"); } /* relocate atoms that have left domain according @@ -130,15 +136,18 @@ void updateAtomsPbc(Atom *atom, Parameter *param) { * that are then enforced in updatePbc */ #define ADDGHOST(dx,dy,dz); \ Nghost++; \ - const int g_atom_idx = atom->Nclusters_local + Nghost; \ - border_map[Nghost] = ci; \ + const int cg = ncj + Nghost; \ + const int cj_natoms = atom->jclusters[cj].natoms; \ + atom->border_map[Nghost] = cj; \ atom->PBCx[Nghost] = dx; \ atom->PBCy[Nghost] = dy; \ atom->PBCz[Nghost] = dz; \ - atom->clusters[g_atom_idx].natoms = atom->clusters[ci].natoms; \ - Nghost_atoms += atom->clusters[g_atom_idx].natoms; \ - for(int cii = 0; cii < atom->clusters[ci].natoms; cii++) { \ - atom->clusters[g_atom_idx].type[cii] = atom->clusters[ci].type[cii]; \ + atom->jclusters[cg].natoms = cj_natoms; \ + Nghost_atoms += cj_natoms; \ + int cj_sca_base = CJ_SCALAR_BASE_INDEX(cj); \ + int cg_sca_base = CJ_SCALAR_BASE_INDEX(cg); \ + for(int cjj = 0; cjj < cj_natoms; cjj++) { \ + atom->cl_type[cg_sca_base + cjj] = atom->cl_type[cj_sca_base + cjj]; \ } /* internal subroutines */ @@ -153,80 +162,86 @@ void growPbc(Atom* atom) { } void setupPbc(Atom *atom, Parameter *param) { - int *border_map = atom->border_map; + DEBUG_MESSAGE("setupPbc start\n"); MD_FLOAT xprd = param->xprd; MD_FLOAT yprd = param->yprd; MD_FLOAT zprd = param->zprd; MD_FLOAT Cutneigh = param->cutneigh; + int jfac = MAX(1, CLUSTER_N / CLUSTER_M); + int ncj = atom->Nclusters_local / jfac; int Nghost = -1; int Nghost_atoms = 0; - for(int ci = 0; ci < atom->Nclusters_local; ci++) { - if (atom->Nclusters_local + Nghost + 7 >= atom->Nclusters_max) { - growClusters(atom); + for(int cj = 0; cj < ncj; cj++) { + if(atom->jclusters[cj].natoms > 0) { + if(atom->Nclusters_local + (Nghost + 7) * jfac >= atom->Nclusters_max) { + growClusters(atom); + } + + if((Nghost + 7) * jfac >= NmaxGhost) { + growPbc(atom); + } + + MD_FLOAT bbminx = atom->jclusters[cj].bbminx; + MD_FLOAT bbmaxx = atom->jclusters[cj].bbmaxx; + MD_FLOAT bbminy = atom->jclusters[cj].bbminy; + MD_FLOAT bbmaxy = atom->jclusters[cj].bbmaxy; + MD_FLOAT bbminz = atom->jclusters[cj].bbminz; + MD_FLOAT bbmaxz = atom->jclusters[cj].bbmaxz; + + /* Setup ghost atoms */ + /* 6 planes */ + if (bbminx < Cutneigh) { ADDGHOST(+1,0,0); } + if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); } + if (bbminy < Cutneigh) { ADDGHOST(0,+1,0); } + if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); } + if (bbminz < Cutneigh) { ADDGHOST(0,0,+1); } + if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); } + /* 8 corners */ + if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); } + if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); } + if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); } + if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); } + if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); } + if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); } + if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); } + if (bbmaxx >= 
(xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); } + /* 12 edges */ + if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); } + if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); } + if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); } + if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); } + if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); } + if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); } + if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); } + if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); } + if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); } + if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); } + if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); } + if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); } } - if (Nghost + 7 >= NmaxGhost) { - growPbc(atom); - border_map = atom->border_map; - } - - MD_FLOAT bbminx = atom->clusters[ci].bbminx; - MD_FLOAT bbmaxx = atom->clusters[ci].bbmaxx; - MD_FLOAT bbminy = atom->clusters[ci].bbminy; - MD_FLOAT bbmaxy = atom->clusters[ci].bbmaxy; - MD_FLOAT bbminz = atom->clusters[ci].bbminz; - MD_FLOAT bbmaxz = atom->clusters[ci].bbmaxz; - - /* Setup ghost atoms */ - /* 6 planes */ - if (bbminx < Cutneigh) { ADDGHOST(+1,0,0); } - if (bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); } - if (bbminy < Cutneigh) { ADDGHOST(0,+1,0); } - if (bbmaxy >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); } - if (bbminz < Cutneigh) { ADDGHOST(0,0,+1); } - if (bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); } - /* 8 corners */ - if (bbminx < Cutneigh && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,+1,+1); } - if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(+1,-1,+1); } - if (bbminx < Cutneigh && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); } - if (bbminx < Cutneigh && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); } - if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(-1,+1,+1); } - if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,-1,+1); } - if (bbmaxx >= (xprd-Cutneigh) && bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); } - if (bbmaxx >= (xprd-Cutneigh) && bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); } - /* 12 edges */ - if (bbminx < Cutneigh && bbminz < Cutneigh) { ADDGHOST(+1,0,+1); } - if (bbminx < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); } - if (bbmaxx >= (xprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(-1,0,+1); } - if (bbmaxx >= (xprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); } - if (bbminy < Cutneigh && bbminz < Cutneigh) { ADDGHOST(0,+1,+1); } - if (bbminy < Cutneigh && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); } - if (bbmaxy >= (yprd-Cutneigh) && bbminz < Cutneigh) { ADDGHOST(0,-1,+1); } - if (bbmaxy >= (yprd-Cutneigh) && bbmaxz >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); } - if (bbminy < Cutneigh && bbminx < Cutneigh) { ADDGHOST(+1,+1,0); } - if (bbminy < Cutneigh && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); } - if (bbmaxy >= (yprd-Cutneigh) && bbminx < Cutneigh) { ADDGHOST(+1,-1,0); } - if (bbmaxy >= (yprd-Cutneigh) && bbmaxx >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); } }
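+    // Editor's note: the block below appends a sentinel j-cluster whose
+    // coordinates are all INFINITY right after the ghost clusters;
+    // pruneNeighbor() pads neighbor lists with this atom->dummy_cj entry so
+    // that their length stays a multiple of VECTOR_WIDTH / CLUSTER_N.
- 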
if(atom->Nclusters_local + Nghost + 1 >= atom->Nclusters_max) { + if(ncj + (Nghost + 1) * jfac >= atom->Nclusters_max) { growClusters(atom); } // Add dummy cluster at the end - MD_FLOAT *cptr = cluster_pos_ptr(atom->Nclusters_local + Nghost + 1); - for(int cii = 0; cii < CLUSTER_DIM_M; cii++) { - cluster_x(cptr, cii) = INFINITY; - cluster_y(cptr, cii) = INFINITY; - cluster_z(cptr, cii) = INFINITY; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(ncj + Nghost + 1); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + for(int cjj = 0; cjj < CLUSTER_N; cjj++) { + cj_x[CL_X_OFFSET + cjj] = INFINITY; + cj_x[CL_Y_OFFSET + cjj] = INFINITY; + cj_x[CL_Z_OFFSET + cjj] = INFINITY; } // Nghost is a zero-based index, so increase it by one to get the ghost cluster count + atom->dummy_cj = ncj + Nghost + 1; atom->Nghost = Nghost_atoms; atom->Nclusters_ghost = Nghost + 1; atom->Nclusters = atom->Nclusters_local + Nghost + 1; // Update created ghost clusters positions updatePbc(atom, param, 1); + DEBUG_MESSAGE("setupPbc end\n"); } diff --git a/gromacs/stats.c b/gromacs/stats.c index 5f62918..c3c9f43 100644 --- a/gromacs/stats.c +++ b/gromacs/stats.c @@ -18,11 +18,11 @@ void initStats(Stats *s) { void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) { #ifdef COMPUTE_STATS - const int MxN = CLUSTER_DIM_M * CLUSTER_DIM_N; + const int MxN = CLUSTER_M * CLUSTER_N; double avg_atoms_cluster = (double)(atom->Nlocal) / (double)(atom->Nclusters_local); double force_useful_volume = 1e-9 * ( (double)(atom->Nlocal * (param->ntimes + 1)) * (sizeof(MD_FLOAT) * 6 + sizeof(int)) + (double)(stats->num_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) ); - double avg_neigh_atom = (stats->num_neighs * CLUSTER_DIM_N) / (double)(atom->Nlocal * (param->ntimes + 1)); + double avg_neigh_atom = (stats->num_neighs * CLUSTER_N) / (double)(atom->Nlocal * (param->ntimes + 1)); double avg_neigh_cluster = (double)(stats->num_neighs) / (double)(stats->calculated_forces); double avg_simd = stats->force_iters / (double)(atom->Nlocal * (param->ntimes + 1)); diff --git a/gromacs/vtk.c b/gromacs/vtk.c index 35cc14e..2a29a26 100644 --- a/gromacs/vtk.c +++ b/gromacs/vtk.c @@ -27,9 +27,10 @@ int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep fprintf(fp, "DATASET UNSTRUCTURED_GRID\n"); fprintf(fp, "POINTS %d double\n", atom->Nlocal); for(int ci = 0; ci < atom->Nclusters_local; ++ci) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { - fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii)); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { + fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); } } fprintf(fp, "\n\n"); @@ -70,9 +71,10 @@ int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep fprintf(fp, "DATASET UNSTRUCTURED_GRID\n"); fprintf(fp, "POINTS %d double\n", atom->Nghost); for(int ci = atom->Nclusters_local; ci < atom->Nclusters_local + atom->Nclusters_ghost; ++ci) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { - fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii)); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { + fprintf(fp, "%.4f %.4f %.4f\n", 
ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); } } fprintf(fp, "\n\n"); @@ -116,18 +118,19 @@ int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int fprintf(fp, "DATASET POLYDATA\n"); fprintf(fp, "POINTS %d double\n", atom->Nlocal); for(int ci = 0; ci < N; ++ci) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { - fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii)); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { + fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); } - tot_lines += atom->clusters[ci].natoms; + tot_lines += atom->iclusters[ci].natoms; } fprintf(fp, "\n\n"); fprintf(fp, "LINES %d %d\n", N, N + tot_lines); for(int ci = 0; ci < N; ++ci) { - fprintf(fp, "%d ", atom->clusters[ci].natoms); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { + fprintf(fp, "%d ", atom->iclusters[ci].natoms); + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { fprintf(fp, "%d ", i++); } @@ -157,18 +160,19 @@ int write_ghost_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int fprintf(fp, "DATASET POLYDATA\n"); fprintf(fp, "POINTS %d double\n", atom->Nghost); for(int ci = atom->Nclusters_local; ci < N; ++ci) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { - fprintf(fp, "%.4f %.4f %.4f\n", cluster_x(cptr, cii), cluster_y(cptr, cii), cluster_z(cptr, cii)); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { + fprintf(fp, "%.4f %.4f %.4f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); } - tot_lines += atom->clusters[ci].natoms; + tot_lines += atom->iclusters[ci].natoms; } fprintf(fp, "\n\n"); fprintf(fp, "LINES %d %d\n", atom->Nclusters_ghost, atom->Nclusters_ghost + tot_lines); for(int ci = atom->Nclusters_local; ci < N; ++ci) { - fprintf(fp, "%d ", atom->clusters[ci].natoms); - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { + fprintf(fp, "%d ", atom->iclusters[ci].natoms); + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { fprintf(fp, "%d ", i++); } diff --git a/gromacs/xtc.c b/gromacs/xtc.c index d381f40..08fb0c9 100644 --- a/gromacs/xtc.c +++ b/gromacs/xtc.c @@ -52,11 +52,12 @@ void xtc_init(const char *filename, Atom *atom, int timestep) { void xtc_write(Atom *atom, int timestep, int write_pos, int write_vel) { int i = 0; for(int ci = 0; ci < atom->Nclusters_local; ++ci) { - MD_FLOAT *cptr = cluster_pos_ptr(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; - for(int cii = 0; cii < atom->clusters[ci].natoms; ++cii) { + for(int cii = 0; cii < atom->iclusters[ci].natoms; ++cii) { - x_buf[i][XX] = cluster_x(cptr, cii); - x_buf[i][YY] = cluster_y(cptr, cii); - x_buf[i][ZZ] = cluster_z(cptr, cii); + x_buf[i][XX] = ci_x[CL_X_OFFSET + cii]; + x_buf[i][YY] = ci_x[CL_Y_OFFSET + cii]; + x_buf[i][ZZ] = ci_x[CL_Z_OFFSET + cii]; i++; } } diff --git a/include_ISA.mk b/include_ISA.mk new file mode 100644 index 0000000..fa9b90b --- /dev/null +++ b/include_ISA.mk @@ -0,0 +1,18 @@ +ifeq ($(strip $(ISA)), SSE) +_VECTOR_WIDTH=2 +else ifeq ($(strip $(ISA)), AVX) +# Vector width is 4 but AVX2 instruction set is not supported +NO_AVX2=true +_VECTOR_WIDTH=4 +else ifeq ($(strip $(ISA)), AVX2) 
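+# AVX2: 256-bit registers, i.e. four double-precision lanes (doubled below for SP)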
+_VECTOR_WIDTH=4 +else ifeq ($(strip $(ISA)), AVX512) +AVX512=true +_VECTOR_WIDTH=8 +endif + +ifeq ($(strip $(DATA_TYPE)), SP) +VECTOR_WIDTH=$(shell echo $$(( $(_VECTOR_WIDTH) * 2 ))) +else +VECTOR_WIDTH=$(_VECTOR_WIDTH) +endif diff --git a/lammps/main-stub.c b/lammps/main-stub.c index 4da90db..c7596e7 100644 --- a/lammps/main-stub.c +++ b/lammps/main-stub.c @@ -217,8 +217,6 @@ int main(int argc, const char *argv[]) { if(!csv) { printf("Number of timesteps: %d\n", param.ntimes); - printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS); - printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS); printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz); printf("Atoms per unit cell: %d\n", atoms_per_unit_cell); printf("Total number of atoms: %d\n", atom->Nlocal); @@ -257,9 +255,8 @@ int main(int argc, const char *argv[]) { E = getTimeStamp(); double T_accum = E-S; double freq_hz = param.proc_freq * 1.e9; - const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS; - const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats); - const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz; + const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes); + const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes) * freq_hz; const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1); if(!csv) {