diff --git a/Makefile b/Makefile index e1cfa29..da24a29 100644 --- a/Makefile +++ b/Makefile @@ -25,6 +25,10 @@ ifneq ($(ASM_SYNTAX), ATT) ASFLAGS += -masm=intel endif +ifneq ($(ATOMS_LOOP_RUNS),) + DEFINES += -DATOMS_LOOP_RUNS=$(ATOMS_LOOP_RUNS) +endif + ifneq ($(NEIGHBORS_LOOP_RUNS),) DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS) endif diff --git a/src/force.c b/src/force.c index 9dee844..540d947 100644 --- a/src/force.c +++ b/src/force.c @@ -140,85 +140,96 @@ double computeForce(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *sta INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs); double S = getTimeStamp(); LIKWID_MARKER_START("force"); - #pragma omp parallel for - for(int i = 0; i < Nlocal; i++) { - neighs = &neighbor->neighbors[i * neighbor->maxneighs]; - int numneighs = neighbor->numneigh[i]; - MD_FLOAT xtmp = atom_x(i); - MD_FLOAT ytmp = atom_y(i); - MD_FLOAT ztmp = atom_z(i); - MD_FLOAT fix = 0; - MD_FLOAT fiy = 0; - MD_FLOAT fiz = 0; - MEM_TRACE(atom_x(i), 'R'); - MEM_TRACE(atom_y(i), 'R'); - MEM_TRACE(atom_z(i), 'R'); - INDEX_TRACE_ATOM(i); + #if VARIANT == stub && defined(ATOMS_LOOP_RUNS) && ATOMS_LOOP_RUNS > 1 + #define REPEAT_ATOMS_LOOP + for(int na = 0; na < (first_exec ? 1 : ATOMS_LOOP_RUNS); na++) { + #endif - #ifdef EXPLICIT_TYPES - const int type_i = atom->type[i]; - MEM_TRACE(atom->type(i), 'R'); - #endif + #pragma omp parallel for + for(int i = 0; i < Nlocal; i++) { + neighs = &neighbor->neighbors[i * neighbor->maxneighs]; + int numneighs = neighbor->numneigh[i]; + MD_FLOAT xtmp = atom_x(i); + MD_FLOAT ytmp = atom_y(i); + MD_FLOAT ztmp = atom_z(i); + MD_FLOAT fix = 0; + MD_FLOAT fiy = 0; + MD_FLOAT fiz = 0; - #if VARIANT == stub && defined(NEIGHBORS_LOOP_RUNS) && NEIGHBORS_LOOP_RUNS > 1 - #define REPEAT_NEIGHBORS_LOOP - int nmax = first_exec ? 1 : NEIGHBORS_LOOP_RUNS; - for(int n = 0; n < nmax; n++) { - #endif + MEM_TRACE(atom_x(i), 'R'); + MEM_TRACE(atom_y(i), 'R'); + MEM_TRACE(atom_z(i), 'R'); + INDEX_TRACE_ATOM(i); - //DIST_TRACE_SORT(neighs, numneighs); - INDEX_TRACE(neighs, numneighs); - //DIST_TRACE(neighs, numneighs); + #ifdef EXPLICIT_TYPES + const int type_i = atom->type[i]; + MEM_TRACE(atom->type(i), 'R'); + #endif - for(int k = 0; k < numneighs; k++) { - int j = neighs[k]; - MD_FLOAT delx = xtmp - atom_x(j); - MD_FLOAT dely = ytmp - atom_y(j); - MD_FLOAT delz = ztmp - atom_z(j); - MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; + #if VARIANT == stub && defined(NEIGHBORS_LOOP_RUNS) && NEIGHBORS_LOOP_RUNS > 1 + #define REPEAT_NEIGHBORS_LOOP + int nmax = first_exec ? 1 : NEIGHBORS_LOOP_RUNS; + for(int nn = 0; nn < (first_exec ? 1 : NEIGHBORS_LOOP_RUNS); nn++) { + #endif - MEM_TRACE(neighs[k], 'R'); - MEM_TRACE(atom_x(j), 'R'); - MEM_TRACE(atom_y(j), 'R'); - MEM_TRACE(atom_z(j), 'R'); + //DIST_TRACE_SORT(neighs, numneighs); + INDEX_TRACE(neighs, numneighs); + //DIST_TRACE(neighs, numneighs); - #ifdef EXPLICIT_TYPES - const int type_j = atom->type[j]; - const int type_ij = type_i * atom->ntypes + type_j; - const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij]; - const MD_FLOAT sigma6 = atom->sigma6[type_ij]; - const MD_FLOAT epsilon = atom->epsilon[type_ij]; - MEM_TRACE(atom->type(j), 'R'); - #endif + for(int k = 0; k < numneighs; k++) { + int j = neighs[k]; + MD_FLOAT delx = xtmp - atom_x(j); + MD_FLOAT dely = ytmp - atom_y(j); + MD_FLOAT delz = ztmp - atom_z(j); + MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; - if(rsq < cutforcesq) { - MD_FLOAT sr2 = 1.0 / rsq; - MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; - MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; - fix += delx * force; - fiy += dely * force; - fiz += delz * force; + MEM_TRACE(neighs[k], 'R'); + MEM_TRACE(atom_x(j), 'R'); + MEM_TRACE(atom_y(j), 'R'); + MEM_TRACE(atom_z(j), 'R'); + + #ifdef EXPLICIT_TYPES + const int type_j = atom->type[j]; + const int type_ij = type_i * atom->ntypes + type_j; + const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij]; + const MD_FLOAT sigma6 = atom->sigma6[type_ij]; + const MD_FLOAT epsilon = atom->epsilon[type_ij]; + MEM_TRACE(atom->type(j), 'R'); + #endif + + if(rsq < cutforcesq) { + MD_FLOAT sr2 = 1.0 / rsq; + MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; + MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; + fix += delx * force; + fiy += dely * force; + fiz += delz * force; + } } + + #ifdef REPEAT_NEIGHBORS_LOOP } + #endif - #ifdef REPEAT_NEIGHBORS_LOOP + fx[i] += fix; + fy[i] += fiy; + fz[i] += fiz; + + addStat(stats->total_force_neighs, numneighs); + addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH); + MEM_TRACE(fx[i], 'R'); + MEM_TRACE(fx[i], 'W'); + MEM_TRACE(fy[i], 'R'); + MEM_TRACE(fy[i], 'W'); + MEM_TRACE(fz[i], 'R'); + MEM_TRACE(fz[i], 'W'); } - #endif - fx[i] += fix; - fy[i] += fiy; - fz[i] += fiz; - - addStat(stats->total_force_neighs, numneighs); - addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH); - MEM_TRACE(fx[i], 'R'); - MEM_TRACE(fx[i], 'W'); - MEM_TRACE(fy[i], 'R'); - MEM_TRACE(fy[i], 'W'); - MEM_TRACE(fz[i], 'R'); - MEM_TRACE(fz[i], 'W'); + #ifdef REPEAT_ATOMS_LOOP } + #endif + LIKWID_MARKER_STOP("force"); double E = getTimeStamp(); diff --git a/src/includes/stats.h b/src/includes/stats.h index d0cde3c..231119b 100644 --- a/src/includes/stats.h +++ b/src/includes/stats.h @@ -31,6 +31,7 @@ typedef struct { } Stats; void initStats(Stats *s); +void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer); #ifdef COMPUTE_STATS # define addStat(stat, value) stat += value; diff --git a/src/includes/timers.h b/src/includes/timers.h new file mode 100644 index 0000000..2400c01 --- /dev/null +++ b/src/includes/timers.h @@ -0,0 +1,11 @@ +#ifndef __TIMERS_H_ +#define __TIMERS_H_ + +typedef enum { + TOTAL = 0, + NEIGH, + FORCE, + NUMTIMER +} timertype; + +#endif diff --git a/src/main-stub.c b/src/main-stub.c index a96f92c..72521f2 100644 --- a/src/main-stub.c +++ b/src/main-stub.c @@ -8,15 +8,17 @@ #include #include #include +#include #include #include +#include #define HLINE "----------------------------------------------------------------------------\n" #define LATTICE_DISTANCE 10.0 #define NEIGH_DISTANCE 1.0 -extern double computeForce(Parameter*, Atom*, Neighbor*, int, int); +extern double computeForce(Parameter*, Atom*, Neighbor*, Stats*, int, int); void init(Parameter *param) { param->epsilon = 1.0; @@ -37,6 +39,7 @@ void init(Parameter *param) { param->nstat = 100; param->temp = 1.44; param->every = 20; + param->proc_freq = 0.0; } // Show debug messages @@ -56,10 +59,10 @@ int main(int argc, const char *argv[]) { Atom atom_data; Atom *atom = (Atom *)(&atom_data); Neighbor neighbor; + Stats stats; Parameter param; int atoms_per_unit_cell = 8; int csv = 0; - double freq = 0.0; LIKWID_MARKER_INIT; LIKWID_MARKER_REGISTER("force"); @@ -95,7 +98,7 @@ int main(int argc, const char *argv[]) { } if((strcmp(argv[i], "-f") == 0)) { - freq = atof(argv[++i]) * 1.E9; + param.proc_freq = atof(argv[++i]); continue; } if((strcmp(argv[i], "-csv") == 0)) @@ -123,6 +126,7 @@ int main(int argc, const char *argv[]) { DEBUG("Initializing atoms...\n"); initAtom(atom); + initStats(&stats); #ifdef EXPLICIT_TYPES atom->ntypes = param.ntypes; @@ -191,6 +195,7 @@ int main(int argc, const char *argv[]) { if(!csv) { printf("Number of timesteps: %d\n", param.ntimes); + printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS); printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS); printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz); printf("Atoms per unit cell: %d\n", atoms_per_unit_cell); @@ -207,41 +212,46 @@ int main(int argc, const char *argv[]) { DEBUG("Building neighbor lists...\n"); buildNeighbor(atom, &neighbor); DEBUG("Computing forces...\n"); - computeForce(¶m, atom, &neighbor, 1, 0); + computeForce(¶m, atom, &neighbor, &stats, 1, 1); double S, E; S = getTimeStamp(); for(int i = 0; i < param.ntimes; i++) { - computeForce(¶m, atom, &neighbor, 0, i + 1); + computeForce(¶m, atom, &neighbor, &stats, 0, i + 1); } E = getTimeStamp(); double T_accum = E-S; - const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * NEIGHBORS_LOOP_RUNS); - const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * NEIGHBORS_LOOP_RUNS) * freq; + double freq_hz = param.proc_freq * 1.e9; + const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS; + const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats); + const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz; const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1); if(!csv) { - printf("Total time: %.4f, Mega atom updates/s: %.4f\n", T_accum, atoms_updates_per_sec / 1.E6); - if(freq > 0.0) { + printf("Total time: %.4f, Mega atom updates/s: %.4f\n", T_accum, atoms_updates_per_sec / 1.e6); + if(param.proc_freq > 0.0) { printf("Cycles per atom: %.4f, Cycles per neighbor: %.4f\n", cycles_per_atom, cycles_per_neigh); } } else { printf("steps,unit cells,atoms/unit cell,total atoms,total vol.(kB),atoms vol.(kB),neigh vol.(kB),time(s),atom upds/s(M)"); - if(freq > 0.0) { + if(param.proc_freq > 0.0) { printf(",cy/atom,cy/neigh"); } printf("\n"); printf("%d,%dx%dx%d,%d,%d,%.4f,%.4f,%.4f,%.4f,%.4f", param.ntimes, param.nx, param.ny, param.nz, atoms_per_unit_cell, atom->Nlocal, - estim_volume / 1.E3, estim_atom_volume / 1.E3, estim_neighbors_volume / 1.E3, T_accum, atoms_updates_per_sec / 1.E6); + estim_volume / 1.e3, estim_atom_volume / 1.e3, estim_neighbors_volume / 1.e3, T_accum, atoms_updates_per_sec / 1.e6); - if(freq > 0.0) { + if(param.proc_freq > 0.0) { printf(",%.4f,%.4f", cycles_per_atom, cycles_per_neigh); } printf("\n"); } + double timer[NUMTIMER]; + timer[FORCE] = T_accum; + displayStatistics(atom, ¶m, &stats, timer); LIKWID_MARKER_CLOSE; return EXIT_SUCCESS; } diff --git a/src/main.c b/src/main.c index ff2b2ad..bff4422 100644 --- a/src/main.c +++ b/src/main.c @@ -38,16 +38,10 @@ #include #include #include +#include #define HLINE "----------------------------------------------------------------------------\n" -typedef enum { - TOTAL = 0, - NEIGH, - FORCE, - NUMTIMER -} timertype; - extern double computeForce(Parameter*, Atom*, Neighbor*, Stats*, int, int); void init(Parameter *param) @@ -257,21 +251,7 @@ int main (int argc, char** argv) printf(HLINE); printf("Performance: %.2f million atom updates per second\n", 1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]); - -#ifdef COMPUTE_STATS - double force_useful_volume = 1e-9 * ( (double)(atom.Nlocal * (param.ntimes + 1)) * (sizeof(MD_FLOAT) * 6 + sizeof(int)) + - (double)(stats.total_force_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) ); -#ifdef EXPLICIT_TYPES - force_useful_volume += 1e-9 * (double)((atom.Nlocal * (param.ntimes + 1)) + stats.total_force_neighs) * sizeof(int); -#endif - printf("Statistics:\n"); - printf("\tVector width: %d, Processor frequency: %.4f GHz\n", VECTOR_WIDTH, param.proc_freq); - printf("\tTotal number of computed pair interactions: %lld\n", stats.total_force_neighs); - printf("\tTotal number of most SIMD iterations: %lld\n", stats.total_force_iters); - printf("\tUseful read data volume for force computation: %.2fGB\n", force_useful_volume); - printf("\tCycles/SIMD iteration: %.4f\n", timer[FORCE] * param.proc_freq * 1e9 / stats.total_force_iters); -#endif - + displayStatistics(&atom, ¶m, &stats, timer); LIKWID_MARKER_CLOSE; return EXIT_SUCCESS; } diff --git a/src/stats.c b/src/stats.c index bae23ee..751544e 100644 --- a/src/stats.c +++ b/src/stats.c @@ -1,6 +1,27 @@ +#include + +#include +#include #include +#include void initStats(Stats *s) { s->total_force_neighs = 0; s->total_force_iters = 0; } + +void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) { +#ifdef COMPUTE_STATS + double force_useful_volume = 1e-9 * ( (double)(atom->Nlocal * (param->ntimes + 1)) * (sizeof(MD_FLOAT) * 6 + sizeof(int)) + + (double)(stats->total_force_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) ); +#ifdef EXPLICIT_TYPES + force_useful_volume += 1e-9 * (double)((atom.Nlocal * (param.ntimes + 1)) + stats.total_force_neighs) * sizeof(int); +#endif + printf("Statistics:\n"); + printf("\tVector width: %d, Processor frequency: %.4f GHz\n", VECTOR_WIDTH, param->proc_freq); + printf("\tTotal number of computed pair interactions: %lld\n", stats->total_force_neighs); + printf("\tTotal number of most SIMD iterations: %lld\n", stats->total_force_iters); + printf("\tUseful read data volume for force computation: %.2fGB\n", force_useful_volume); + printf("\tCycles/SIMD iteration: %.4f\n", timer[FORCE] * param->proc_freq * 1e9 / stats->total_force_iters); +#endif +}