diff --git a/Makefile b/Makefile index bc0e037..2e05f9a 100644 --- a/Makefile +++ b/Makefile @@ -68,5 +68,5 @@ clean: distclean: clean @echo "===> DIST CLEAN" - @rm -f $(TARGET) + @rm -f $(TARGET)* @rm -f tags diff --git a/config.mk b/config.mk index afbd2ca..0d02951 100644 --- a/config.mk +++ b/config.mk @@ -1,9 +1,9 @@ # Supported: GCC, CLANG, ICC -TAG ?= GCC +TAG ?= ICC # SP or DP DATA_TYPE ?= DP # AOS or SOA -DATA_LAYOUT ?= SOA +DATA_LAYOUT ?= AOS #Feature options OPTIONS += -DALIGNMENT=64 -DLIKWID_PERFMON diff --git a/include_GCC.mk b/include_GCC.mk index 3ec307a..954ac5d 100644 --- a/include_GCC.mk +++ b/include_GCC.mk @@ -10,8 +10,6 @@ ANSI_CFLAGS += -Wextra CFLAGS = -O3 -march=znver1 -ffast-math -funroll-loops # -fopenmp ASFLAGS = -masm=intel LFLAGS = -DEFINES = -D_GNU_SOURCE -#INCLUDES = -#LIBS = -lm +DEFINES = -D_GNU_SOURCE -DLIKWID_PERFMON INCLUDES = $(LIKWID_INC) LIBS = -lm $(LIKWID_LIB) -llikwid diff --git a/include_ICC.mk b/include_ICC.mk index ddb61ac..37b017f 100644 --- a/include_ICC.mk +++ b/include_ICC.mk @@ -3,15 +3,15 @@ LINKER = $(CC) OPENMP = #-qopenmp PROFILE = #-profile-functions -g -pg -# OPTS = -fast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE) + OPTS = -Ofast -xCORE-AVX512 -qopt-zmm-usage=high $(PROFILE) #OPTS = -fast -xCORE-AVX2 $(PROFILE) #OPTS = -fast -xAVX $(PROFILE) #OPTS = -fast -xSSE4.2 $(PROFILE) #OPTS = -fast -no-vec $(PROFILE) -OPTS = -fast -xHost $(PROFILE) +#OPTS = -fast -xHost $(PROFILE) CFLAGS = $(PROFILE) -restrict $(OPENMP) $(OPTS) -ASFLAGS = -masm=intel +ASFLAGS = #-masm=intel LFLAGS = $(PROFILE) $(OPTS) $(OPENMP) -DEFINES = -D_GNU_SOURCE # -DALIGNMENT=64 -DLIKWID_PERFMON -DPRECISION=1 +DEFINES = -D_GNU_SOURCE #-DLIKWID_PERFMON INCLUDES = #$(LIKWID_INC) LIBS = -lm #$(LIKWID_LIB) -llikwid diff --git a/src/force.c b/src/force.c index e1cb39d..c7c6121 100644 --- a/src/force.c +++ b/src/force.c @@ -40,9 +40,8 @@ double computeForce( MD_FLOAT sigma6 = param->sigma6; MD_FLOAT epsilon = param->epsilon; MD_FLOAT* fx = atom->fx; 
MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz; - MD_FLOAT S, E; + double S, E; - S = getTimeStamp(); for(int i = 0; i < Nlocal; i++) { fx[i] = 0.0; fy[i] = 0.0; @@ -50,49 +49,47 @@ double computeForce( } if(profile) { - LIKWID_MARKER_START("force"); + // LIKWID_MARKER_START("force"); } - for(int t = 0; t < ntimes; t++) { #pragma omp parallel for - for(int i = 0; i < Nlocal; i++) { - neighs = &neighbor->neighbors[i * neighbor->maxneighs]; - int numneighs = neighbor->numneigh[i]; - MD_FLOAT xtmp = atom_x(i); - MD_FLOAT ytmp = atom_y(i); - MD_FLOAT ztmp = atom_z(i); + for(int i = 0; i < Nlocal; i++) { + neighs = &neighbor->neighbors[i * neighbor->maxneighs]; + int numneighs = neighbor->numneigh[i]; + MD_FLOAT xtmp = atom_x(i); + MD_FLOAT ytmp = atom_y(i); + MD_FLOAT ztmp = atom_z(i); + MD_FLOAT fix = 0; + MD_FLOAT fiy = 0; + MD_FLOAT fiz = 0; - MD_FLOAT fix = 0; - MD_FLOAT fiy = 0; - MD_FLOAT fiz = 0; +// printf("%d: %d\n", i, numneighs); - for(int k = 0; k < numneighs; k++) { - int j = neighs[k]; - MD_FLOAT delx = xtmp - atom_x(j); - MD_FLOAT dely = ytmp - atom_y(j); - MD_FLOAT delz = ztmp - atom_z(j); - MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; + for(int k = 0; k < numneighs; k++) { + int j = neighs[k]; + MD_FLOAT delx = xtmp - atom_x(j); + MD_FLOAT dely = ytmp - atom_y(j); + MD_FLOAT delz = ztmp - atom_z(j); + MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; - if(rsq < cutforcesq) { - MD_FLOAT sr2 = 1.0 / rsq; - MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; - MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; - fix += delx * force; - fiy += dely * force; - fiz += delz * force; - } + if(rsq < cutforcesq) { + MD_FLOAT sr2 = 1.0 / rsq; + MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; + MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; + fix += delx * force; + fiy += dely * force; + fiz += delz * force; } - - fx[i] += fix; - fy[i] += fiy; - fz[i] += fiz; } + + fx[i] += fix; + fy[i] += fiy; + fz[i] += fiz; } if(profile) { - 
LIKWID_MARKER_STOP("force"); + // LIKWID_MARKER_STOP("force"); } - E = getTimeStamp(); - return E-S; + return 0.0; } diff --git a/src/main-stub.c b/src/main-stub.c index 0f4d037..a92a8f4 100644 --- a/src/main-stub.c +++ b/src/main-stub.c @@ -158,11 +158,17 @@ int main(int argc, const char *argv[]) { } } - const double estim_volume = (double)(atom->Nlocal * 6 * sizeof(MD_FLOAT) + atom->Nlocal * (atoms_per_unit_cell - 1 + 2) * sizeof(int)) / 1000.0; + const double estim_volume = (double) + (atom->Nlocal * 6 * sizeof(MD_FLOAT) + + atom->Nlocal * (atoms_per_unit_cell - 1 + 2) * sizeof(int)) / 1000.0; printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz); printf("Atoms per unit cell: %d\n", atoms_per_unit_cell); printf("Total number of atoms: %d\n", atom->Nlocal); - printf("Estimated memory volume (kB): %.4f\n", estim_volume); + printf("Estimated total data volume (kB): %.4f\n", estim_volume ); + printf("Estimated atom data volume (kB): %.4f\n", + (double)(atom->Nlocal * 3 * sizeof(MD_FLOAT) / 1000.0)); + printf("Estimated neighborlist data volume (kB): %.4f\n", + (double)(atom->Nlocal * (atoms_per_unit_cell - 1 + 2) * sizeof(int)) / 1000.0); DEBUG("Initializing neighbor lists...\n"); initNeighbor(&neighbor, &param); @@ -171,10 +177,20 @@ int main(int argc, const char *argv[]) { DEBUG("Building neighbor lists...\n"); buildNeighbor(atom, &neighbor); DEBUG("Computing forces...\n"); - computeForce(&param, atom, &neighbor, 0, 1); + computeForce(&param, atom, &neighbor, 0); - double T_accum = computeForce(&param, atom, &neighbor, 1, param.ntimes); - printf("Total time: %.4f, Time/force: %.4f\n", T_accum, T_accum / param.ntimes); + double S, E; + S = getTimeStamp(); + LIKWID_MARKER_START("force"); + for(int i = 0; i < param.ntimes; i++) { + computeForce(&param, atom, &neighbor, 1); + } + LIKWID_MARKER_STOP("force"); + E = getTimeStamp(); + double T_accum = E-S; + + printf("Total time: %.4f, Mega atom updates/s: %.4f\n", + T_accum, atom->Nlocal * param.ntimes/T_accum/1.E6);
LIKWID_MARKER_CLOSE; return EXIT_SUCCESS; }