Instrumented the reneighbor function in order to obtain runtimes of its components
This commit is contained in:
parent
176de0525b
commit
7a61cbbabf
@ -54,5 +54,5 @@ extern void binatoms(Atom*);
|
|||||||
extern void buildNeighbor(Atom*, Neighbor*);
|
extern void buildNeighbor(Atom*, Neighbor*);
|
||||||
extern void sortAtom(Atom*);
|
extern void sortAtom(Atom*);
|
||||||
extern void binatoms_cuda(Atom*, Binning*, int*, Neighbor_params*, const int);
|
extern void binatoms_cuda(Atom*, Binning*, int*, Neighbor_params*, const int);
|
||||||
extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int);
|
extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int, double*);
|
||||||
#endif
|
#endif
|
||||||
|
@ -8,7 +8,8 @@ typedef enum {
|
|||||||
NEIGH_UPDATE_ATOMS_PBC,
|
NEIGH_UPDATE_ATOMS_PBC,
|
||||||
NEIGH_SETUP_PBC,
|
NEIGH_SETUP_PBC,
|
||||||
NEIGH_UPDATE_PBC,
|
NEIGH_UPDATE_PBC,
|
||||||
NEIGH_BUILD_NEIGHBOR,
|
NEIGH_BINATOMS,
|
||||||
|
NEIGH_BUILD_LISTS,
|
||||||
NUMTIMER
|
NUMTIMER
|
||||||
} timertype;
|
} timertype;
|
||||||
|
|
||||||
|
24
src/main.c
24
src/main.c
@ -124,7 +124,8 @@ double setup(
|
|||||||
Atom *c_atom,
|
Atom *c_atom,
|
||||||
Neighbor *c_neighbor,
|
Neighbor *c_neighbor,
|
||||||
Stats *stats,
|
Stats *stats,
|
||||||
const int num_threads_per_block)
|
const int num_threads_per_block,
|
||||||
|
double* timers)
|
||||||
{
|
{
|
||||||
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
||||||
double S, E;
|
double S, E;
|
||||||
@ -145,7 +146,7 @@ double setup(
|
|||||||
setupPbc(atom, param);
|
setupPbc(atom, param);
|
||||||
initCudaAtom(atom, neighbor, c_atom, c_neighbor);
|
initCudaAtom(atom, neighbor, c_atom, c_neighbor);
|
||||||
updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
|
updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
|
||||||
buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block);
|
buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block, timers);
|
||||||
E = getTimeStamp();
|
E = getTimeStamp();
|
||||||
|
|
||||||
|
|
||||||
@ -179,11 +180,11 @@ double reneighbour(
|
|||||||
timers[NEIGH_UPDATE_PBC] += afterEvent - beforeEvent;
|
timers[NEIGH_UPDATE_PBC] += afterEvent - beforeEvent;
|
||||||
beforeEvent = afterEvent;
|
beforeEvent = afterEvent;
|
||||||
//sortAtom(atom);
|
//sortAtom(atom);
|
||||||
buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block);
|
buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block, timers);
|
||||||
LIKWID_MARKER_STOP("reneighbour");
|
LIKWID_MARKER_STOP("reneighbour");
|
||||||
E = getTimeStamp();
|
E = getTimeStamp();
|
||||||
afterEvent = E;
|
afterEvent = E;
|
||||||
timers[NEIGH_BUILD_NEIGHBOR] += afterEvent - beforeEvent;
|
timers[NEIGH_BUILD_LISTS] += afterEvent - beforeEvent;
|
||||||
|
|
||||||
return E-S;
|
return E-S;
|
||||||
}
|
}
|
||||||
@ -331,7 +332,7 @@ int main(int argc, char** argv)
|
|||||||
// this should be multiple of 32 as operations are performed at the level of warps
|
// this should be multiple of 32 as operations are performed at the level of warps
|
||||||
const int num_threads_per_block = get_num_threads();
|
const int num_threads_per_block = get_num_threads();
|
||||||
|
|
||||||
setup(¶m, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block);
|
setup(¶m, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block, (double*) &timer);
|
||||||
computeThermo(0, ¶m, &atom);
|
computeThermo(0, ¶m, &atom);
|
||||||
if(param.force_field == FF_EAM) {
|
if(param.force_field == FF_EAM) {
|
||||||
computeForceEam(&eam, ¶m, &atom, &neighbor, &stats, 1, 0);
|
computeForceEam(&eam, ¶m, &atom, &neighbor, &stats, 1, 0);
|
||||||
@ -349,7 +350,8 @@ int main(int argc, char** argv)
|
|||||||
timer[NEIGH_UPDATE_ATOMS_PBC] = 0.0;
|
timer[NEIGH_UPDATE_ATOMS_PBC] = 0.0;
|
||||||
timer[NEIGH_SETUP_PBC] = 0.0;
|
timer[NEIGH_SETUP_PBC] = 0.0;
|
||||||
timer[NEIGH_UPDATE_PBC] = 0.0;
|
timer[NEIGH_UPDATE_PBC] = 0.0;
|
||||||
timer[NEIGH_BUILD_NEIGHBOR] = 0.0;
|
timer[NEIGH_BINATOMS] = 0.0;
|
||||||
|
timer[NEIGH_BUILD_LISTS] = 0.0;
|
||||||
|
|
||||||
if(param.vtk_file != NULL) {
|
if(param.vtk_file != NULL) {
|
||||||
write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
|
write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
|
||||||
@ -362,7 +364,7 @@ int main(int argc, char** argv)
|
|||||||
cuda_initial_integrate(doReneighbour, ¶m, &atom, &c_atom, num_threads_per_block);
|
cuda_initial_integrate(doReneighbour, ¶m, &atom, &c_atom, num_threads_per_block);
|
||||||
|
|
||||||
if(doReneighbour) {
|
if(doReneighbour) {
|
||||||
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block, &timer);
|
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block, (double*) &timer);
|
||||||
} else {
|
} else {
|
||||||
double before = getTimeStamp();
|
double before = getTimeStamp();
|
||||||
updatePbc_cuda(&atom, ¶m, &c_atom, false, num_threads_per_block);
|
updatePbc_cuda(&atom, ¶m, &c_atom, false, num_threads_per_block);
|
||||||
@ -392,6 +394,7 @@ int main(int argc, char** argv)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
timer[NEIGH_BUILD_LISTS] -= timer[NEIGH_BINATOMS];
|
||||||
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
|
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
|
||||||
computeThermo(-1, ¶m, &atom);
|
computeThermo(-1, ¶m, &atom);
|
||||||
|
|
||||||
@ -405,11 +408,14 @@ int main(int argc, char** argv)
|
|||||||
#endif
|
#endif
|
||||||
printf(HLINE);
|
printf(HLINE);
|
||||||
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
|
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
|
||||||
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs NEIGH_TIMERS: UPD_AT: %.2fs SETUP_PBC %.2fs UPDATE_PBC %.2fs BUILD_NEIGHBOR %.2fs\n",
|
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs NEIGH_TIMERS: UPD_AT: %.2fs SETUP_PBC %.2fs UPDATE_PBC %.2fs BINATOMS %.2fs BUILD_NEIGHBOR %.2fs\n",
|
||||||
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH], timer[NEIGH_UPDATE_ATOMS_PBC], timer[NEIGH_SETUP_PBC], timer[NEIGH_UPDATE_PBC], timer[NEIGH_BUILD_NEIGHBOR]);
|
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH], timer[NEIGH_UPDATE_ATOMS_PBC], timer[NEIGH_SETUP_PBC], timer[NEIGH_UPDATE_PBC], timer[NEIGH_BINATOMS], timer[NEIGH_BUILD_LISTS]);
|
||||||
printf(HLINE);
|
printf(HLINE);
|
||||||
printf("Performance: %.2f million atom updates per second\n",
|
printf("Performance: %.2f million atom updates per second\n",
|
||||||
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
|
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
|
||||||
|
double atomUpdatesTotal = (double) atom.Natoms * param.ntimes;
|
||||||
|
double atomNeighUpdatesTotal = (double) atom.Natoms * param.ntimes / param.every;
|
||||||
|
printf("Neighbor_perf in millions per sec: updateAtomsPbc: %.2f setupPbc: %.2f updatePbc: %.2f binAtoms: %.2f buildNeighbor_wo_binning: %.2f\n", 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_UPDATE_ATOMS_PBC], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_SETUP_PBC], 1e-6 * atomUpdatesTotal / timer[NEIGH_UPDATE_PBC], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_BINATOMS], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_BUILD_LISTS]);
|
||||||
#ifdef COMPUTE_STATS
|
#ifdef COMPUTE_STATS
|
||||||
displayStatistics(&atom, ¶m, &stats, timer);
|
displayStatistics(&atom, ¶m, &stats, timer);
|
||||||
#endif
|
#endif
|
||||||
|
@ -33,6 +33,8 @@ extern "C" {
|
|||||||
#include <parameter.h>
|
#include <parameter.h>
|
||||||
#include <allocate.h>
|
#include <allocate.h>
|
||||||
#include <atom.h>
|
#include <atom.h>
|
||||||
|
#include <timing.h>
|
||||||
|
#include <timers.h>
|
||||||
|
|
||||||
#define SMALL 1.0e-6
|
#define SMALL 1.0e-6
|
||||||
#define FACTOR 0.999
|
#define FACTOR 0.999
|
||||||
@ -609,7 +611,7 @@ void binatoms_cuda(Atom* c_atom, Binning* c_binning, int* c_resize_needed, Neigh
|
|||||||
checkCUDAError( "DeviceSync sort_bin_contents kernel", cudaDeviceSynchronize() );
|
checkCUDAError( "DeviceSync sort_bin_contents kernel", cudaDeviceSynchronize() );
|
||||||
}
|
}
|
||||||
|
|
||||||
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block)
|
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block, double* timers)
|
||||||
{
|
{
|
||||||
int nall = atom->Nlocal + atom->Nghost;
|
int nall = atom->Nlocal + atom->Nghost;
|
||||||
c_neighbor->maxneighs = neighbor->maxneighs;
|
c_neighbor->maxneighs = neighbor->maxneighs;
|
||||||
@ -652,7 +654,10 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* bin local & ghost atoms */
|
/* bin local & ghost atoms */
|
||||||
|
double beforeBinning = getTimeStamp();
|
||||||
binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
|
binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
|
||||||
|
double afterBinning = getTimeStamp();
|
||||||
|
timers[NEIGH_BINATOMS] += afterBinning - beforeBinning;
|
||||||
|
|
||||||
if(c_new_maxneighs == NULL){
|
if(c_new_maxneighs == NULL){
|
||||||
checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));
|
checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));
|
||||||
|
Loading…
Reference in New Issue
Block a user