Move src directory to lammps

Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
Remove likwid and architecture-specific compilation flags
2022-08-04 17:25:31 +02:00 · 2022-08-04 17:09:17 +02:00 · 2022-07-22 13:55:27 +02:00 · 2022-07-20 23:04:22 +02:00 · 2022-07-20 22:43:57 +02:00 · 2022-07-19 20:38:11 +02:00
29 changed files with 139 additions and 56 deletions
--- a/include_NVCC.mk
+++ b/include_NVCC.mk
@@ -7,9 +7,10 @@ ANSI_CFLAGS += -pedantic
 ANSI_CFLAGS += -Wextra

 # CFLAGS   = -O0 -g  -std=c99 -fargument-noalias
-CFLAGS   = -O3 -g -arch=sm_61 # -fopenmp
+#CFLAGS   = -O3 -g -arch=sm_61 # -fopenmp
+CFLAGS   = -O3 -g # -fopenmp
 ASFLAGS  =  -masm=intel
 LFLAGS   =
-DEFINES  = -D_GNU_SOURCE -DLIKWID_PERFMON
+DEFINES  = -D_GNU_SOURCE #-DLIKWID_PERFMON
 INCLUDES = $(LIKWID_INC)
-LIBS     = -lm $(LIKWID_LIB) -llikwid -lcuda -lcudart
+LIBS     = -lm $(LIKWID_LIB) -lcuda -lcudart #-llikwid
--- a/lammps/allocate.c
+++ b/lammps/allocate.c
--- a/lammps/atom.c
+++ b/lammps/atom.c
--- a/lammps/eam_utils.c
+++ b/lammps/eam_utils.c
--- a/lammps/force-tracing.c
+++ b/lammps/force-tracing.c
--- a/lammps/force.cu
+++ b/lammps/force.cu
@@ -153,7 +153,6 @@ void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, At

    if(doReneighbour) {
        checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
-        checkCUDAError( "InitialIntegrate: position memcpy", cudaMemcpy(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
    }
 }

--- a/lammps/force_eam.c
+++ b/lammps/force_eam.c
--- a/lammps/includes/allocate.h
+++ b/lammps/includes/allocate.h
--- a/lammps/includes/atom.h
+++ b/lammps/includes/atom.h
--- a/lammps/includes/eam.h
+++ b/lammps/includes/eam.h
--- a/lammps/includes/likwid-marker.h
+++ b/lammps/includes/likwid-marker.h
--- a/lammps/includes/neighbor.h
+++ b/lammps/includes/neighbor.h
@@ -54,5 +54,5 @@ extern void binatoms(Atom*);
 extern void buildNeighbor(Atom*, Neighbor*);
 extern void sortAtom(Atom*);
 extern void binatoms_cuda(Atom*, Binning*, int*, Neighbor_params*, const int);
-extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int);
+extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int, double*);
 #endif
--- a/lammps/includes/parameter.h
+++ b/lammps/includes/parameter.h
--- a/lammps/includes/pbc.h
+++ b/lammps/includes/pbc.h
@@ -29,5 +29,6 @@ extern void initPbc(Atom*);
 extern void updatePbc(Atom*, Parameter*);
 extern void updatePbc_cuda(Atom*, Parameter*, Atom*, bool, const int);
 extern void updateAtomsPbc(Atom*, Parameter*);
+extern void updateAtomsPbc_cuda(Atom*, Parameter*, Atom*, const int);
 extern void setupPbc(Atom*, Parameter*);
 #endif
--- a/lammps/includes/stats.h
+++ b/lammps/includes/stats.h
--- a/lammps/includes/thermo.h
+++ b/lammps/includes/thermo.h
--- a/lammps/includes/timers.h
+++ b/lammps/includes/timers.h
@@ -5,6 +5,11 @@ typedef enum {
    TOTAL = 0,
    NEIGH,
    FORCE,
+    NEIGH_UPDATE_ATOMS_PBC,
+    NEIGH_SETUP_PBC,
+    NEIGH_UPDATE_PBC,
+    NEIGH_BINATOMS,
+    NEIGH_BUILD_LISTS,
    NUMTIMER
 } timertype;

--- a/lammps/includes/timing.h
+++ b/lammps/includes/timing.h
--- a/lammps/includes/util.h
+++ b/lammps/includes/util.h
--- a/lammps/includes/vtk.h
+++ b/lammps/includes/vtk.h
--- a/lammps/main-stub.c
+++ b/lammps/main-stub.c
--- a/lammps/main.c
+++ b/lammps/main.c
@@ -124,7 +124,8 @@ double setup(
        Atom *c_atom,
        Neighbor *c_neighbor,
        Stats *stats,
-        const int num_threads_per_block)
+        const int num_threads_per_block,
+	double* timers)
 {
    if(param->force_field == FF_EAM) { initEam(eam, param); }
    double S, E;
@@ -145,7 +146,7 @@ double setup(
    setupPbc(atom, param);
    initCudaAtom(atom, neighbor, c_atom, c_neighbor);
    updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
-    buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block);
+    buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block, timers);
    E = getTimeStamp();


@@ -158,19 +159,32 @@ double reneighbour(
        Neighbor *neighbor,
        Atom *c_atom,
        Neighbor *c_neighbor,
-        const int num_threads_per_block)
+        const int num_threads_per_block,
+	double* timers)
 {
-    double S, E;
+    double S, E, beforeEvent, afterEvent;

    S = getTimeStamp();
+    beforeEvent = S;
    LIKWID_MARKER_START("reneighbour");
-    updateAtomsPbc(atom, param);
+    updateAtomsPbc_cuda(atom, param, c_atom, num_threads_per_block);
+    afterEvent = getTimeStamp();
+    timers[NEIGH_UPDATE_ATOMS_PBC] += afterEvent - beforeEvent;
+    beforeEvent = afterEvent;
    setupPbc(atom, param);
+    afterEvent = getTimeStamp();
+    timers[NEIGH_SETUP_PBC] += afterEvent - beforeEvent;
+    beforeEvent = afterEvent;
    updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
+    afterEvent = getTimeStamp();
+    timers[NEIGH_UPDATE_PBC] += afterEvent - beforeEvent;
+    beforeEvent = afterEvent;
    //sortAtom(atom);
-    buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block);
+    buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block, timers);
    LIKWID_MARKER_STOP("reneighbour");
    E = getTimeStamp();
+    afterEvent = E;
+    timers[NEIGH_BUILD_LISTS] += afterEvent - beforeEvent;

    return E-S;
 }
@@ -318,7 +332,7 @@ int main(int argc, char** argv)
    // this should be multiple of 32 as operations are performed at the level of warps
    const int num_threads_per_block = get_num_threads();

-    setup(&param, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block);
+    setup(&param, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block, (double*) &timer);
    computeThermo(0, &param, &atom);
    if(param.force_field == FF_EAM) {
        computeForceEam(&eam, &param, &atom, &neighbor, &stats, 1, 0);
@@ -333,6 +347,11 @@ int main(int argc, char** argv)
    timer[FORCE] = 0.0;
    timer[NEIGH] = 0.0;
    timer[TOTAL] = getTimeStamp();
+    timer[NEIGH_UPDATE_ATOMS_PBC] = 0.0;
+    timer[NEIGH_SETUP_PBC] = 0.0;
+    timer[NEIGH_UPDATE_PBC] = 0.0;
+    timer[NEIGH_BINATOMS] = 0.0;
+    timer[NEIGH_BUILD_LISTS] = 0.0;

    if(param.vtk_file != NULL) {
        write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
@@ -345,9 +364,12 @@ int main(int argc, char** argv)
        cuda_initial_integrate(doReneighbour, &param, &atom, &c_atom, num_threads_per_block);

        if(doReneighbour) {
-            timer[NEIGH] += reneighbour(&param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
+            timer[NEIGH] += reneighbour(&param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block, (double*) &timer);
        } else {
+	    double before = getTimeStamp();
            updatePbc_cuda(&atom, &param, &c_atom, false, num_threads_per_block);
+	    double after = getTimeStamp();
+	    timer[NEIGH_UPDATE_PBC] += after - before;
        }

        if(param.force_field == FF_EAM) {
@@ -372,6 +394,7 @@ int main(int argc, char** argv)
        }
    }

+    timer[NEIGH_BUILD_LISTS] -= timer[NEIGH_BINATOMS];
    timer[TOTAL] = getTimeStamp() - timer[TOTAL];
    computeThermo(-1, &param, &atom);

@@ -385,11 +408,15 @@ int main(int argc, char** argv)
 #endif
    printf(HLINE);
    printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
-    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
-            timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
+    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs   NEIGH_TIMERS: UPD_AT: %.2fs SETUP_PBC %.2fs UPDATE_PBC %.2fs BINATOMS %.2fs BUILD_NEIGHBOR %.2fs\n",
+            timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH], timer[NEIGH_UPDATE_ATOMS_PBC], timer[NEIGH_SETUP_PBC], timer[NEIGH_UPDATE_PBC], timer[NEIGH_BINATOMS], timer[NEIGH_BUILD_LISTS]);
    printf(HLINE);
    printf("Performance: %.2f million atom updates per second\n",
            1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
+    double atomUpdatesTotal = (double) atom.Natoms * param.ntimes;
+    printf("Force_perf in millions per sec: %.2f\n", 1e-6 * atomUpdatesTotal / timer[FORCE]);
+    double atomNeighUpdatesTotal = (double) atom.Natoms * param.ntimes / param.every;
+    printf("Neighbor_perf in millions per sec: updateAtomsPbc: %.2f setupPbc: %.2f updatePbc: %.2f binAtoms: %.2f buildNeighbor_wo_binning: %.2f\n", 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_UPDATE_ATOMS_PBC], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_SETUP_PBC], 1e-6 * atomUpdatesTotal / timer[NEIGH_UPDATE_PBC], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_BINATOMS], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_BUILD_LISTS]);
 #ifdef COMPUTE_STATS
    displayStatistics(&atom, &param, &stats, timer);
 #endif
--- a/lammps/neighbor.cu
+++ b/lammps/neighbor.cu
@@ -33,6 +33,8 @@ extern "C" {
 #include <parameter.h>
 #include <allocate.h>
 #include <atom.h>
+#include <timing.h>
+#include <timers.h>

 #define SMALL 1.0e-6
 #define FACTOR 0.999
@@ -194,6 +196,17 @@ static int nstencil;      // # of bins in stencil
 static int* stencil;      // stencil list of bin offsets
 static MD_FLOAT binsizex, binsizey, binsizez;

+static int* c_stencil = NULL;
+static int* c_resize_needed = NULL;
+static int* c_new_maxneighs = NULL;
+static Binning c_binning{
+        .bincount = NULL,
+        .bins = NULL,
+        .mbins = 0,
+        .atoms_per_bin = 0
+};
+
+
 static int coord2bin(MD_FLOAT, MD_FLOAT , MD_FLOAT);
 static MD_FLOAT bindist(int, int, int);

@@ -506,21 +519,21 @@ void sortAtom(Atom* atom) {
    }

 #ifdef AOS
-    double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
+    MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);

-    double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
+    MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
 #else
-    double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT));
-    double* new_y = (double*) malloc(Nmax * sizeof(MD_FLOAT));
-    double* new_z = (double*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));

-    double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT));
-    double* new_vy = (double*) malloc(Nmax * sizeof(MD_FLOAT));
-    double* new_vz = (double*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
+    MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
 #endif

-    double* old_x = atom->x; double* old_y = atom->y; double* old_z = atom->z;
-    double* old_vx = atom->vx; double* old_vy = atom->vy; double* old_vz = atom->vz;
+    MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z;
+    MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz;

    for(int mybin = 0; mybin<mbins; mybin++) {
        int start = mybin>0?binpos[mybin-1]:0;
@@ -568,12 +581,6 @@ void binatoms_cuda(Atom* c_atom, Binning* c_binning, int* c_resize_needed, Neigh
 {
    int nall = c_atom->Nlocal + c_atom->Nghost;
    int resize = 1;
-    if(c_binning->bincount == NULL){
-        checkCUDAError("binatoms_cuda c_binning->bincount malloc", cudaMalloc((void**)(&c_binning->bincount), c_binning->mbins * sizeof(int)) );
-    }
-    if(c_binning->bins == NULL){
-        checkCUDAError("binatoms_cuda c_binning->bins malloc", cudaMalloc((void**)(&c_binning->bins), c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) );
-    }

    const int num_blocks = ceil((float)nall / (float)threads_per_block);

@@ -585,10 +592,10 @@ void binatoms_cuda(Atom* c_atom, Binning* c_binning, int* c_resize_needed, Neigh
        /*binatoms_kernel(Atom a, int* bincount, int* bins, int c_binning->atoms_per_bin, Neighbor_params np, int *resize_needed) */
        binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);

-	checkCUDAError( "PeekAtLastError binatoms kernel", cudaPeekAtLastError() );
-	checkCUDAError( "DeviceSync binatoms kernel", cudaDeviceSynchronize() );
+	    checkCUDAError( "PeekAtLastError binatoms kernel", cudaPeekAtLastError() );
+	    checkCUDAError( "DeviceSync binatoms kernel", cudaDeviceSynchronize() );
        
-	checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) );
+	    checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) );

        if(resize) {
            cudaFree(c_binning->bins);
@@ -604,23 +611,25 @@ void binatoms_cuda(Atom* c_atom, Binning* c_binning, int* c_resize_needed, Neigh
    checkCUDAError( "DeviceSync sort_bin_contents kernel", cudaDeviceSynchronize() );
 }

-void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block)
+void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block, double* timers)
 {
    int nall = atom->Nlocal + atom->Nghost;
    c_neighbor->maxneighs = neighbor->maxneighs;

    cudaProfilerStart();
    /* upload stencil */
-    int* c_stencil;
-    // TODO move this to be done once at the start
-    checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
-    checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice ));
+    // TODO move all of this initialization into its own method
+    if(c_stencil == NULL){
+        checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
+        checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice ));
+    }

-    Binning c_binning;
-    c_binning.mbins = mbins;
-    c_binning.atoms_per_bin = atoms_per_bin;
-    checkCUDAError( "buildNeighbor c_binning->bincount malloc", cudaMalloc((void**)&(c_binning.bincount), mbins * sizeof(int)) );
-    checkCUDAError( "buidlNeighbor c_binning->bins malloc", cudaMalloc((void**)&(c_binning.bins), c_binning.mbins * c_binning.atoms_per_bin * sizeof(int)) );
+    if(c_binning.mbins == 0){
+        c_binning.mbins = mbins;
+        c_binning.atoms_per_bin = atoms_per_bin;
+        checkCUDAError( "buildNeighbor c_binning->bincount malloc", cudaMalloc((void**)&(c_binning.bincount), c_binning.mbins * sizeof(int)) );
+        checkCUDAError( "buidlNeighbor c_binning->bins malloc", cudaMalloc((void**)&(c_binning.bins), c_binning.mbins * c_binning.atoms_per_bin * sizeof(int)) );
+    }

    Neighbor_params np{
            .xprd = xprd,
@@ -640,14 +649,19 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
            .mbinz = mbinz
    };

-    int* c_resize_needed;
-    checkCUDAError("buildNeighbor c_resize_needed malloc", cudaMalloc((void**)&c_resize_needed, sizeof(int)) );
-    
-    /* bin local & ghost atoms */
-    binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
+    if(c_resize_needed == NULL){
+        checkCUDAError("buildNeighbor c_resize_needed malloc", cudaMalloc((void**)&c_resize_needed, sizeof(int)) );
+    }

-    int* c_new_maxneighs;
-    checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));
+    /* bin local & ghost atoms */
+    double beforeBinning = getTimeStamp();
+    binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
+    double afterBinning = getTimeStamp();
+    timers[NEIGH_BINATOMS] += afterBinning - beforeBinning;
+
+    if(c_new_maxneighs == NULL){
+        checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));
+    }

    int resize = 1;
    
@@ -701,10 +715,5 @@ void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *
    neighbor->maxneighs = c_neighbor->maxneighs;

    cudaProfilerStop();
-
-    cudaFree(c_new_maxneighs);
-    cudaFree(c_stencil);
-    cudaFree(c_binning.bincount);
-    cudaFree(c_binning.bins);
 }
 }
--- a/lammps/pbc.cu
+++ b/lammps/pbc.cu
@@ -33,6 +33,32 @@ extern "C" {

 }

+__global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    Atom* atom = &a;
+    if( i >= atom->Nlocal ){
+        return;
+    }
+
+    if (atom_x(i) < 0.0) {
+        atom_x(i) += xprd;
+    } else if (atom_x(i) >= xprd) {
+        atom_x(i) -= xprd;
+    }
+
+    if (atom_y(i) < 0.0) {
+        atom_y(i) += yprd;
+    } else if (atom_y(i) >= yprd) {
+        atom_y(i) -= yprd;
+    }
+
+    if (atom_z(i) < 0.0) {
+        atom_z(i) += zprd;
+    } else if (atom_z(i) >= zprd) {
+        atom_z(i) -= zprd;
+    }
+}
+
 __global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    const int Nghost = a.Nghost;
@@ -163,6 +189,21 @@ void updateAtomsPbc(Atom *atom, Parameter *param) {
    }
 }

+void updateAtomsPbc_cuda(Atom* atom, Parameter* param, Atom* c_atom, const int num_threads_per_block){
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
+    /*void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)*/
+    computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, xprd, yprd, zprd);
+
+    checkCUDAError( "PeekAtLastError UpdateAtomsPbc", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync UpdateAtomsPbc", cudaDeviceSynchronize() );
+
+    checkCUDAError( "updateAtomsPbc position memcpy back", cudaMemcpy(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+}
+
 /* setup periodic boundary conditions by
 * defining ghost atoms around domain
 * only creates mapping and coordinate corrections
--- a/lammps/stats.c
+++ b/lammps/stats.c
--- a/lammps/thermo.c
+++ b/lammps/thermo.c
--- a/lammps/timing.c
+++ b/lammps/timing.c
--- a/lammps/util.c
+++ b/lammps/util.c
--- a/lammps/vtk.c
+++ b/lammps/vtk.c
Author	SHA1	Message	Date
Rafael Ravedutti	bc7b523979	Move src directory to lammps Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2022-08-04 17:25:31 +02:00
Rafael Ravedutti	eeba125a52	Remove likwid and architecture-specific compilation flags Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>	2022-08-04 17:09:17 +02:00
Martin Bauernfeind	b32254b03f	Changed data types in currently unused sort method to also work with single precision floating numbers	2022-07-22 13:55:27 +02:00
Martin Bauernfeind	4dac820784	Added newline in output to improve formatting	2022-07-20 23:04:22 +02:00
Martin Bauernfeind	fe56c50efd	Added one more output line to output the force kernel throughput	2022-07-20 22:43:57 +02:00
Martin Bauernfeind	7a61cbbabf	Instrumented the reneighbor function in order to obtain runtimes of its compontents	2022-07-19 20:38:11 +02:00
Martin Bauernfeind	176de0525b	Instrumented the reneighbor function with timers (via getTimestamp()) to measure the runtime of its different components/methods	2022-07-17 18:34:17 +02:00
Martin Bauernfeind	7bad7e84b6	Fixed compiler errors	2022-07-13 14:52:37 +02:00
Martin Bauernfeind	fb304f240b	Small changes in buildNeighbor to initialize the bincount list and other arrays only once	2022-07-13 14:42:34 +02:00
Martin Bauernfeind	5a6d1851ed	Ported updateAtomsPbc to cuda and changed the code to use the cuda version from now on	2022-07-13 14:07:19 +02:00