From 7a61cbbabfcfb78f9e04196e7398ff968ead0720 Mon Sep 17 00:00:00 2001 From: Martin Bauernfeind Date: Tue, 19 Jul 2022 20:38:11 +0200 Subject: [PATCH] Instrumented the reneighbor function in order to obtain runtimes of its compontents --- src/includes/neighbor.h | 2 +- src/includes/timers.h | 3 ++- src/main.c | 24 +++++++++++++++--------- src/neighbor.cu | 7 ++++++- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/includes/neighbor.h b/src/includes/neighbor.h index 0af49a5..c9a1b07 100644 --- a/src/includes/neighbor.h +++ b/src/includes/neighbor.h @@ -54,5 +54,5 @@ extern void binatoms(Atom*); extern void buildNeighbor(Atom*, Neighbor*); extern void sortAtom(Atom*); extern void binatoms_cuda(Atom*, Binning*, int*, Neighbor_params*, const int); -extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int); +extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int, double*); #endif diff --git a/src/includes/timers.h b/src/includes/timers.h index 9938f50..16c720e 100644 --- a/src/includes/timers.h +++ b/src/includes/timers.h @@ -8,7 +8,8 @@ typedef enum { NEIGH_UPDATE_ATOMS_PBC, NEIGH_SETUP_PBC, NEIGH_UPDATE_PBC, - NEIGH_BUILD_NEIGHBOR, + NEIGH_BINATOMS, + NEIGH_BUILD_LISTS, NUMTIMER } timertype; diff --git a/src/main.c b/src/main.c index 8e0c2bf..23e5c82 100644 --- a/src/main.c +++ b/src/main.c @@ -124,7 +124,8 @@ double setup( Atom *c_atom, Neighbor *c_neighbor, Stats *stats, - const int num_threads_per_block) + const int num_threads_per_block, + double* timers) { if(param->force_field == FF_EAM) { initEam(eam, param); } double S, E; @@ -145,7 +146,7 @@ double setup( setupPbc(atom, param); initCudaAtom(atom, neighbor, c_atom, c_neighbor); updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block); - buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block); + buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block, timers); E = getTimeStamp(); @@ -179,11 +180,11 @@ double reneighbour( timers[NEIGH_UPDATE_PBC] += afterEvent - beforeEvent; beforeEvent = afterEvent; //sortAtom(atom); - buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block); + buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block, timers); LIKWID_MARKER_STOP("reneighbour"); E = getTimeStamp(); afterEvent = E; - timers[NEIGH_BUILD_NEIGHBOR] += afterEvent - beforeEvent; + timers[NEIGH_BUILD_LISTS] += afterEvent - beforeEvent; return E-S; } @@ -331,7 +332,7 @@ int main(int argc, char** argv) // this should be multiple of 32 as operations are performed at the level of warps const int num_threads_per_block = get_num_threads(); - setup(¶m, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block); + setup(¶m, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block, (double*) &timer); computeThermo(0, ¶m, &atom); if(param.force_field == FF_EAM) { computeForceEam(&eam, ¶m, &atom, &neighbor, &stats, 1, 0); @@ -349,7 +350,8 @@ int main(int argc, char** argv) timer[NEIGH_UPDATE_ATOMS_PBC] = 0.0; timer[NEIGH_SETUP_PBC] = 0.0; timer[NEIGH_UPDATE_PBC] = 0.0; - timer[NEIGH_BUILD_NEIGHBOR] = 0.0; + timer[NEIGH_BINATOMS] = 0.0; + timer[NEIGH_BUILD_LISTS] = 0.0; if(param.vtk_file != NULL) { write_atoms_to_vtk_file(param.vtk_file, &atom, 0); @@ -362,7 +364,7 @@ int main(int argc, char** argv) cuda_initial_integrate(doReneighbour, ¶m, &atom, &c_atom, num_threads_per_block); if(doReneighbour) { - timer[NEIGH] += reneighbour(¶m, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block, &timer); + timer[NEIGH] += reneighbour(¶m, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block, (double*) &timer); } else { double before = getTimeStamp(); updatePbc_cuda(&atom, ¶m, &c_atom, false, num_threads_per_block); @@ -392,6 +394,7 @@ int main(int argc, char** argv) } } + timer[NEIGH_BUILD_LISTS] -= timer[NEIGH_BINATOMS]; timer[TOTAL] = getTimeStamp() - timer[TOTAL]; computeThermo(-1, ¶m, &atom); @@ -405,11 +408,14 @@ int main(int argc, char** argv) #endif printf(HLINE); printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes); - printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs NEIGH_TIMERS: UPD_AT: %.2fs SETUP_PBC %.2fs UPDATE_PBC %.2fs BUILD_NEIGHBOR %.2fs\n", - timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH], timer[NEIGH_UPDATE_ATOMS_PBC], timer[NEIGH_SETUP_PBC], timer[NEIGH_UPDATE_PBC], timer[NEIGH_BUILD_NEIGHBOR]); + printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs NEIGH_TIMERS: UPD_AT: %.2fs SETUP_PBC %.2fs UPDATE_PBC %.2fs BINATOMS %.2fs BUILD_NEIGHBOR %.2fs\n", + timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH], timer[NEIGH_UPDATE_ATOMS_PBC], timer[NEIGH_SETUP_PBC], timer[NEIGH_UPDATE_PBC], timer[NEIGH_BINATOMS], timer[NEIGH_BUILD_LISTS]); printf(HLINE); printf("Performance: %.2f million atom updates per second\n", 1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]); + double atomUpdatesTotal = (double) atom.Natoms * param.ntimes; + double atomNeighUpdatesTotal = (double) atom.Natoms * param.ntimes / param.every; + printf("Neighbor_perf in millions per sec: updateAtomsPbc: %.2f setupPbc: %.2f updatePbc: %.2f binAtoms: %.2f buildNeighbor_wo_binning: %.2f\n", 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_UPDATE_ATOMS_PBC], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_SETUP_PBC], 1e-6 * atomUpdatesTotal / timer[NEIGH_UPDATE_PBC], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_BINATOMS], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_BUILD_LISTS]); #ifdef COMPUTE_STATS displayStatistics(&atom, ¶m, &stats, timer); #endif diff --git a/src/neighbor.cu b/src/neighbor.cu index 7727100..6017125 100644 --- a/src/neighbor.cu +++ b/src/neighbor.cu @@ -33,6 +33,8 @@ extern "C" { #include #include #include +#include +#include #define SMALL 1.0e-6 #define FACTOR 0.999 @@ -609,7 +611,7 @@ void binatoms_cuda(Atom* c_atom, Binning* c_binning, int* c_resize_needed, Neigh checkCUDAError( "DeviceSync sort_bin_contents kernel", cudaDeviceSynchronize() ); } -void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block) +void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block, double* timers) { int nall = atom->Nlocal + atom->Nghost; c_neighbor->maxneighs = neighbor->maxneighs; @@ -652,7 +654,10 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor * } /* bin local & ghost atoms */ + double beforeBinning = getTimeStamp(); binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block); + double afterBinning = getTimeStamp(); + timers[NEIGH_BINATOMS] += afterBinning - beforeBinning; if(c_new_maxneighs == NULL){ checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));