Add ATOMS_LOOP_RUNS option and statistics to stub variant

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
Rafael Ravedutti 2021-10-12 22:39:54 +02:00
parent 55d346510e
commit 43ba28e130
7 changed files with 137 additions and 99 deletions

View File

@ -25,6 +25,10 @@ ifneq ($(ASM_SYNTAX), ATT)
ASFLAGS += -masm=intel ASFLAGS += -masm=intel
endif endif
ifneq ($(ATOMS_LOOP_RUNS),)
DEFINES += -DATOMS_LOOP_RUNS=$(ATOMS_LOOP_RUNS)
endif
ifneq ($(NEIGHBORS_LOOP_RUNS),) ifneq ($(NEIGHBORS_LOOP_RUNS),)
DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS) DEFINES += -DNEIGHBORS_LOOP_RUNS=$(NEIGHBORS_LOOP_RUNS)
endif endif

View File

@ -140,85 +140,96 @@ double computeForce(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *sta
INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs); INDEX_TRACE_NATOMS(Nlocal, atom->Nghost, neighbor->maxneighs);
double S = getTimeStamp(); double S = getTimeStamp();
LIKWID_MARKER_START("force"); LIKWID_MARKER_START("force");
#pragma omp parallel for
for(int i = 0; i < Nlocal; i++) {
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
int numneighs = neighbor->numneigh[i];
MD_FLOAT xtmp = atom_x(i);
MD_FLOAT ytmp = atom_y(i);
MD_FLOAT ztmp = atom_z(i);
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
MEM_TRACE(atom_x(i), 'R'); #if VARIANT == stub && defined(ATOMS_LOOP_RUNS) && ATOMS_LOOP_RUNS > 1
MEM_TRACE(atom_y(i), 'R'); #define REPEAT_ATOMS_LOOP
MEM_TRACE(atom_z(i), 'R'); for(int na = 0; na < (first_exec ? 1 : ATOMS_LOOP_RUNS); na++) {
INDEX_TRACE_ATOM(i); #endif
#ifdef EXPLICIT_TYPES #pragma omp parallel for
const int type_i = atom->type[i]; for(int i = 0; i < Nlocal; i++) {
MEM_TRACE(atom->type(i), 'R'); neighs = &neighbor->neighbors[i * neighbor->maxneighs];
#endif int numneighs = neighbor->numneigh[i];
MD_FLOAT xtmp = atom_x(i);
MD_FLOAT ytmp = atom_y(i);
MD_FLOAT ztmp = atom_z(i);
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
#if VARIANT == stub && defined(NEIGHBORS_LOOP_RUNS) && NEIGHBORS_LOOP_RUNS > 1 MEM_TRACE(atom_x(i), 'R');
#define REPEAT_NEIGHBORS_LOOP MEM_TRACE(atom_y(i), 'R');
int nmax = first_exec ? 1 : NEIGHBORS_LOOP_RUNS; MEM_TRACE(atom_z(i), 'R');
for(int n = 0; n < nmax; n++) { INDEX_TRACE_ATOM(i);
#endif
//DIST_TRACE_SORT(neighs, numneighs); #ifdef EXPLICIT_TYPES
INDEX_TRACE(neighs, numneighs); const int type_i = atom->type[i];
//DIST_TRACE(neighs, numneighs); MEM_TRACE(atom->type(i), 'R');
#endif
for(int k = 0; k < numneighs; k++) { #if VARIANT == stub && defined(NEIGHBORS_LOOP_RUNS) && NEIGHBORS_LOOP_RUNS > 1
int j = neighs[k]; #define REPEAT_NEIGHBORS_LOOP
MD_FLOAT delx = xtmp - atom_x(j); int nmax = first_exec ? 1 : NEIGHBORS_LOOP_RUNS;
MD_FLOAT dely = ytmp - atom_y(j); for(int nn = 0; nn < (first_exec ? 1 : NEIGHBORS_LOOP_RUNS); nn++) {
MD_FLOAT delz = ztmp - atom_z(j); #endif
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
MEM_TRACE(neighs[k], 'R'); //DIST_TRACE_SORT(neighs, numneighs);
MEM_TRACE(atom_x(j), 'R'); INDEX_TRACE(neighs, numneighs);
MEM_TRACE(atom_y(j), 'R'); //DIST_TRACE(neighs, numneighs);
MEM_TRACE(atom_z(j), 'R');
#ifdef EXPLICIT_TYPES for(int k = 0; k < numneighs; k++) {
const int type_j = atom->type[j]; int j = neighs[k];
const int type_ij = type_i * atom->ntypes + type_j; MD_FLOAT delx = xtmp - atom_x(j);
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij]; MD_FLOAT dely = ytmp - atom_y(j);
const MD_FLOAT sigma6 = atom->sigma6[type_ij]; MD_FLOAT delz = ztmp - atom_z(j);
const MD_FLOAT epsilon = atom->epsilon[type_ij]; MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
MEM_TRACE(atom->type(j), 'R');
#endif
if(rsq < cutforcesq) { MEM_TRACE(neighs[k], 'R');
MD_FLOAT sr2 = 1.0 / rsq; MEM_TRACE(atom_x(j), 'R');
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; MEM_TRACE(atom_y(j), 'R');
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; MEM_TRACE(atom_z(j), 'R');
fix += delx * force;
fiy += dely * force; #ifdef EXPLICIT_TYPES
fiz += delz * force; const int type_j = atom->type[j];
const int type_ij = type_i * atom->ntypes + type_j;
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
const MD_FLOAT epsilon = atom->epsilon[type_ij];
MEM_TRACE(atom->type(j), 'R');
#endif
if(rsq < cutforcesq) {
MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
fix += delx * force;
fiy += dely * force;
fiz += delz * force;
}
} }
#ifdef REPEAT_NEIGHBORS_LOOP
} }
#endif
#ifdef REPEAT_NEIGHBORS_LOOP fx[i] += fix;
fy[i] += fiy;
fz[i] += fiz;
addStat(stats->total_force_neighs, numneighs);
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
MEM_TRACE(fx[i], 'R');
MEM_TRACE(fx[i], 'W');
MEM_TRACE(fy[i], 'R');
MEM_TRACE(fy[i], 'W');
MEM_TRACE(fz[i], 'R');
MEM_TRACE(fz[i], 'W');
} }
#endif
fx[i] += fix; #ifdef REPEAT_ATOMS_LOOP
fy[i] += fiy;
fz[i] += fiz;
addStat(stats->total_force_neighs, numneighs);
addStat(stats->total_force_iters, (numneighs + VECTOR_WIDTH - 1) / VECTOR_WIDTH);
MEM_TRACE(fx[i], 'R');
MEM_TRACE(fx[i], 'W');
MEM_TRACE(fy[i], 'R');
MEM_TRACE(fy[i], 'W');
MEM_TRACE(fz[i], 'R');
MEM_TRACE(fz[i], 'W');
} }
#endif
LIKWID_MARKER_STOP("force"); LIKWID_MARKER_STOP("force");
double E = getTimeStamp(); double E = getTimeStamp();

View File

@ -31,6 +31,7 @@ typedef struct {
} Stats; } Stats;
void initStats(Stats *s); void initStats(Stats *s);
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer);
#ifdef COMPUTE_STATS #ifdef COMPUTE_STATS
# define addStat(stat, value) stat += value; # define addStat(stat, value) stat += value;

11
src/includes/timers.h Normal file
View File

@ -0,0 +1,11 @@
#ifndef __TIMERS_H_
#define __TIMERS_H_
typedef enum {
TOTAL = 0,
NEIGH,
FORCE,
NUMTIMER
} timertype;
#endif

View File

@ -8,15 +8,17 @@
#include <neighbor.h> #include <neighbor.h>
#include <parameter.h> #include <parameter.h>
#include <atom.h> #include <atom.h>
#include <stats.h>
#include <thermo.h> #include <thermo.h>
#include <pbc.h> #include <pbc.h>
#include <timers.h>
#define HLINE "----------------------------------------------------------------------------\n" #define HLINE "----------------------------------------------------------------------------\n"
#define LATTICE_DISTANCE 10.0 #define LATTICE_DISTANCE 10.0
#define NEIGH_DISTANCE 1.0 #define NEIGH_DISTANCE 1.0
extern double computeForce(Parameter*, Atom*, Neighbor*, int, int); extern double computeForce(Parameter*, Atom*, Neighbor*, Stats*, int, int);
void init(Parameter *param) { void init(Parameter *param) {
param->epsilon = 1.0; param->epsilon = 1.0;
@ -37,6 +39,7 @@ void init(Parameter *param) {
param->nstat = 100; param->nstat = 100;
param->temp = 1.44; param->temp = 1.44;
param->every = 20; param->every = 20;
param->proc_freq = 0.0;
} }
// Show debug messages // Show debug messages
@ -56,10 +59,10 @@ int main(int argc, const char *argv[]) {
Atom atom_data; Atom atom_data;
Atom *atom = (Atom *)(&atom_data); Atom *atom = (Atom *)(&atom_data);
Neighbor neighbor; Neighbor neighbor;
Stats stats;
Parameter param; Parameter param;
int atoms_per_unit_cell = 8; int atoms_per_unit_cell = 8;
int csv = 0; int csv = 0;
double freq = 0.0;
LIKWID_MARKER_INIT; LIKWID_MARKER_INIT;
LIKWID_MARKER_REGISTER("force"); LIKWID_MARKER_REGISTER("force");
@ -95,7 +98,7 @@ int main(int argc, const char *argv[]) {
} }
if((strcmp(argv[i], "-f") == 0)) if((strcmp(argv[i], "-f") == 0))
{ {
freq = atof(argv[++i]) * 1.E9; param.proc_freq = atof(argv[++i]);
continue; continue;
} }
if((strcmp(argv[i], "-csv") == 0)) if((strcmp(argv[i], "-csv") == 0))
@ -123,6 +126,7 @@ int main(int argc, const char *argv[]) {
DEBUG("Initializing atoms...\n"); DEBUG("Initializing atoms...\n");
initAtom(atom); initAtom(atom);
initStats(&stats);
#ifdef EXPLICIT_TYPES #ifdef EXPLICIT_TYPES
atom->ntypes = param.ntypes; atom->ntypes = param.ntypes;
@ -191,6 +195,7 @@ int main(int argc, const char *argv[]) {
if(!csv) { if(!csv) {
printf("Number of timesteps: %d\n", param.ntimes); printf("Number of timesteps: %d\n", param.ntimes);
printf("Number of times to compute the atoms loop: %d\n", ATOMS_LOOP_RUNS);
printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS); printf("Number of times to compute the neighbors loop: %d\n", NEIGHBORS_LOOP_RUNS);
printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz); printf("System size (unit cells): %dx%dx%d\n", param.nx, param.ny, param.nz);
printf("Atoms per unit cell: %d\n", atoms_per_unit_cell); printf("Atoms per unit cell: %d\n", atoms_per_unit_cell);
@ -207,41 +212,46 @@ int main(int argc, const char *argv[]) {
DEBUG("Building neighbor lists...\n"); DEBUG("Building neighbor lists...\n");
buildNeighbor(atom, &neighbor); buildNeighbor(atom, &neighbor);
DEBUG("Computing forces...\n"); DEBUG("Computing forces...\n");
computeForce(&param, atom, &neighbor, 1, 0); computeForce(&param, atom, &neighbor, &stats, 1, 1);
double S, E; double S, E;
S = getTimeStamp(); S = getTimeStamp();
for(int i = 0; i < param.ntimes; i++) { for(int i = 0; i < param.ntimes; i++) {
computeForce(&param, atom, &neighbor, 0, i + 1); computeForce(&param, atom, &neighbor, &stats, 0, i + 1);
} }
E = getTimeStamp(); E = getTimeStamp();
double T_accum = E-S; double T_accum = E-S;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * NEIGHBORS_LOOP_RUNS); double freq_hz = param.proc_freq * 1.e9;
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * NEIGHBORS_LOOP_RUNS) * freq; const double repeats = ATOMS_LOOP_RUNS * NEIGHBORS_LOOP_RUNS;
const double atoms_updates_per_sec = (double)(atom->Nlocal) / T_accum * (double)(param.ntimes * repeats);
const double cycles_per_atom = T_accum / (double)(atom->Nlocal) / (double)(param.ntimes * repeats) * freq_hz;
const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1); const double cycles_per_neigh = cycles_per_atom / (double)(atoms_per_unit_cell - 1);
if(!csv) { if(!csv) {
printf("Total time: %.4f, Mega atom updates/s: %.4f\n", T_accum, atoms_updates_per_sec / 1.E6); printf("Total time: %.4f, Mega atom updates/s: %.4f\n", T_accum, atoms_updates_per_sec / 1.e6);
if(freq > 0.0) { if(param.proc_freq > 0.0) {
printf("Cycles per atom: %.4f, Cycles per neighbor: %.4f\n", cycles_per_atom, cycles_per_neigh); printf("Cycles per atom: %.4f, Cycles per neighbor: %.4f\n", cycles_per_atom, cycles_per_neigh);
} }
} else { } else {
printf("steps,unit cells,atoms/unit cell,total atoms,total vol.(kB),atoms vol.(kB),neigh vol.(kB),time(s),atom upds/s(M)"); printf("steps,unit cells,atoms/unit cell,total atoms,total vol.(kB),atoms vol.(kB),neigh vol.(kB),time(s),atom upds/s(M)");
if(freq > 0.0) { if(param.proc_freq > 0.0) {
printf(",cy/atom,cy/neigh"); printf(",cy/atom,cy/neigh");
} }
printf("\n"); printf("\n");
printf("%d,%dx%dx%d,%d,%d,%.4f,%.4f,%.4f,%.4f,%.4f", printf("%d,%dx%dx%d,%d,%d,%.4f,%.4f,%.4f,%.4f,%.4f",
param.ntimes, param.nx, param.ny, param.nz, atoms_per_unit_cell, atom->Nlocal, param.ntimes, param.nx, param.ny, param.nz, atoms_per_unit_cell, atom->Nlocal,
estim_volume / 1.E3, estim_atom_volume / 1.E3, estim_neighbors_volume / 1.E3, T_accum, atoms_updates_per_sec / 1.E6); estim_volume / 1.e3, estim_atom_volume / 1.e3, estim_neighbors_volume / 1.e3, T_accum, atoms_updates_per_sec / 1.e6);
if(freq > 0.0) { if(param.proc_freq > 0.0) {
printf(",%.4f,%.4f", cycles_per_atom, cycles_per_neigh); printf(",%.4f,%.4f", cycles_per_atom, cycles_per_neigh);
} }
printf("\n"); printf("\n");
} }
double timer[NUMTIMER];
timer[FORCE] = T_accum;
displayStatistics(atom, &param, &stats, timer);
LIKWID_MARKER_CLOSE; LIKWID_MARKER_CLOSE;
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }

View File

@ -38,16 +38,10 @@
#include <stats.h> #include <stats.h>
#include <thermo.h> #include <thermo.h>
#include <pbc.h> #include <pbc.h>
#include <timers.h>
#define HLINE "----------------------------------------------------------------------------\n" #define HLINE "----------------------------------------------------------------------------\n"
typedef enum {
TOTAL = 0,
NEIGH,
FORCE,
NUMTIMER
} timertype;
extern double computeForce(Parameter*, Atom*, Neighbor*, Stats*, int, int); extern double computeForce(Parameter*, Atom*, Neighbor*, Stats*, int, int);
void init(Parameter *param) void init(Parameter *param)
@ -257,21 +251,7 @@ int main (int argc, char** argv)
printf(HLINE); printf(HLINE);
printf("Performance: %.2f million atom updates per second\n", printf("Performance: %.2f million atom updates per second\n",
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]); 1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
displayStatistics(&atom, &param, &stats, timer);
#ifdef COMPUTE_STATS
double force_useful_volume = 1e-9 * ( (double)(atom.Nlocal * (param.ntimes + 1)) * (sizeof(MD_FLOAT) * 6 + sizeof(int)) +
(double)(stats.total_force_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) );
#ifdef EXPLICIT_TYPES
force_useful_volume += 1e-9 * (double)((atom.Nlocal * (param.ntimes + 1)) + stats.total_force_neighs) * sizeof(int);
#endif
printf("Statistics:\n");
printf("\tVector width: %d, Processor frequency: %.4f GHz\n", VECTOR_WIDTH, param.proc_freq);
printf("\tTotal number of computed pair interactions: %lld\n", stats.total_force_neighs);
printf("\tTotal number of most SIMD iterations: %lld\n", stats.total_force_iters);
printf("\tUseful read data volume for force computation: %.2fGB\n", force_useful_volume);
printf("\tCycles/SIMD iteration: %.4f\n", timer[FORCE] * param.proc_freq * 1e9 / stats.total_force_iters);
#endif
LIKWID_MARKER_CLOSE; LIKWID_MARKER_CLOSE;
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }

View File

@ -1,6 +1,27 @@
#include <stdio.h>
#include <atom.h>
#include <parameter.h>
#include <stats.h> #include <stats.h>
#include <timers.h>
void initStats(Stats *s) { void initStats(Stats *s) {
s->total_force_neighs = 0; s->total_force_neighs = 0;
s->total_force_iters = 0; s->total_force_iters = 0;
} }
void displayStatistics(Atom *atom, Parameter *param, Stats *stats, double *timer) {
#ifdef COMPUTE_STATS
double force_useful_volume = 1e-9 * ( (double)(atom->Nlocal * (param->ntimes + 1)) * (sizeof(MD_FLOAT) * 6 + sizeof(int)) +
(double)(stats->total_force_neighs) * (sizeof(MD_FLOAT) * 3 + sizeof(int)) );
#ifdef EXPLICIT_TYPES
force_useful_volume += 1e-9 * (double)((atom.Nlocal * (param.ntimes + 1)) + stats.total_force_neighs) * sizeof(int);
#endif
printf("Statistics:\n");
printf("\tVector width: %d, Processor frequency: %.4f GHz\n", VECTOR_WIDTH, param->proc_freq);
printf("\tTotal number of computed pair interactions: %lld\n", stats->total_force_neighs);
printf("\tTotal number of most SIMD iterations: %lld\n", stats->total_force_iters);
printf("\tUseful read data volume for force computation: %.2fGB\n", force_useful_volume);
printf("\tCycles/SIMD iteration: %.4f\n", timer[FORCE] * param->proc_freq * 1e9 / stats->total_force_iters);
#endif
}