From c70ebce4c1da8db00e90d1fc5b1f8eb4dae63b2c Mon Sep 17 00:00:00 2001 From: Rafael Ravedutti Date: Tue, 8 Nov 2022 18:33:23 +0100 Subject: [PATCH] Integrate GROMACS GPU implementation into master branch Signed-off-by: Rafael Ravedutti --- {lammps => common}/device.c | 18 -- {lammps => common}/includes/device.h | 2 + gromacs/cuda/force_lj.cu | 317 +++++++++++++++++++++++++++ gromacs/includes/atom.h | 50 +++-- gromacs/includes/integrate.h | 56 +++++ gromacs/includes/pbc.h | 6 +- gromacs/main.c | 84 ++++--- gromacs/pbc.c | 2 +- lammps/device_spec.c | 29 +++ 9 files changed, 480 insertions(+), 84 deletions(-) rename {lammps => common}/device.c (60%) rename {lammps => common}/includes/device.h (96%) create mode 100644 gromacs/cuda/force_lj.cu create mode 100644 gromacs/includes/integrate.h create mode 100644 lammps/device_spec.c diff --git a/lammps/device.c b/common/device.c similarity index 60% rename from lammps/device.c rename to common/device.c index a90f8cb..d1d51ef 100644 --- a/lammps/device.c +++ b/common/device.c @@ -58,24 +58,6 @@ void memsetGPU(void *d_ptr, int value, size_t bytesize) { cuda_assert("memsetGPU", cudaMemset(d_ptr, value, bytesize)); } -void initDevice(Atom *atom, Neighbor *neighbor) { - DeviceAtom *d_atom = &(atom->d_atom); - DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor); - - d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); - d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); - d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); - d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs); - d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax); - - memcpyToGPU(d_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3); - memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3); - memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); - memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); - memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); - memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax); -} - #else void initDevice(Atom *atom, Neighbor *neighbor) {} void *allocateGPU(size_t bytesize) { return NULL; } diff --git a/lammps/includes/device.h b/common/includes/device.h similarity index 96% rename from lammps/includes/device.h rename to common/includes/device.h index 3c6d0c4..f28cc62 100644 --- a/lammps/includes/device.h +++ b/common/includes/device.h @@ -4,6 +4,8 @@ * Use of this source code is governed by a LGPL-3.0 * license that can be found in the LICENSE file. */ +#include +//--- #include #include diff --git a/gromacs/cuda/force_lj.cu b/gromacs/cuda/force_lj.cu new file mode 100644 index 0000000..96d3e26 --- /dev/null +++ b/gromacs/cuda/force_lj.cu @@ -0,0 +1,317 @@ +/* + * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg. + * All rights reserved. This file is part of MD-Bench. + * Use of this source code is governed by a LGPL-3.0 + * license that can be found in the LICENSE file. 
+ */ +extern "C" { + +#include +//--- +#include +#include +//--- +#include +//--- +#include +#include +#include +#include +#include +#include +#include + +} + +extern "C" { + MD_FLOAT *cuda_cl_x; + MD_FLOAT *cuda_cl_v; + MD_FLOAT *cuda_cl_f; + int *cuda_neighbors; + int *cuda_numneigh; + int *cuda_natoms; + int *natoms; + int *ngatoms; + int *cuda_border_map; + int *cuda_jclusters_natoms; + MD_FLOAT *cuda_bbminx, *cuda_bbmaxx; + MD_FLOAT *cuda_bbminy, *cuda_bbmaxy; + MD_FLOAT *cuda_bbminz, *cuda_bbmaxz; + int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz; + int isReneighboured; +} + +extern "C" +void initDevice(Atom *atom, Neighbor *neighbor) { + cuda_assert("cudaDeviceSetup", cudaDeviceReset()); + cuda_assert("cudaDeviceSetup", cudaSetDevice(0)); + cuda_cl_x = (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + cuda_cl_v = (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + cuda_cl_f = (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + cuda_natoms = (int *) allocateGPU(atom->Nclusters_max * sizeof(int)); + cuda_jclusters_natoms = (int *) allocateGPU(atom->Nclusters_max * sizeof(int)); + cuda_border_map = (int *) allocateGPU(atom->Nclusters_max * sizeof(int)); + cuda_PBCx = (int *) allocateGPU(atom->Nclusters_max * sizeof(int)); + cuda_PBCy = (int *) allocateGPU(atom->Nclusters_max * sizeof(int)); + cuda_PBCz = (int *) allocateGPU(atom->Nclusters_max * sizeof(int)); + cuda_numneigh = (int *) allocateGPU(atom->Nclusters_max * sizeof(int)); + cuda_neighbors = (int *) allocateGPU(atom->Nclusters_max * neighbor->maxneighs * sizeof(int)); + natoms = (int *) malloc(atom->Nclusters_max); + ngatoms = (int *) malloc(atom->Nclusters_max); + isReneighboured = 1; +} + +extern "C" +void copyDataToCUDADevice(Atom *atom) { + memcpyToGPU(cuda_cl_x, atom->cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + memcpyToGPU(cuda_cl_v, atom->cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + memcpyToGPU(cuda_cl_f, atom->cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + + for(int ci = 0; ci < atom->Nclusters_local; ci++) { + natoms[ci] = atom->iclusters[ci].natoms; + } + + memcpyToGPU(cuda_natoms, natoms, atom->Nclusters_local * sizeof(int)); + + int jfac = MAX(1, CLUSTER_N / CLUSTER_M); + int ncj = atom->Nclusters_local / jfac; + for(int cg = 0; cg < atom->Nclusters_ghost; cg++) { + const int cj = ncj + cg; + ngatoms[cg] = atom->jclusters[cj].natoms; + } + + memcpyToGPU(cuda_jclusters_natoms, ngatoms, atom->Nclusters_ghost * sizeof(int)); + memcpyToGPU(cuda_border_map, atom->border_map, atom->Nclusters_ghost * sizeof(int)); + memcpyToGPU(cuda_PBCx, atom->PBCx, atom->Nclusters_ghost * sizeof(int)); + memcpyToGPU(cuda_PBCy, atom->PBCy, atom->Nclusters_ghost * sizeof(int)); + memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int)); +} + +extern "C" +void copyDataFromCUDADevice(Atom *atom) { + memcpyFromGPU(atom->cl_x, cuda_cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + memcpyFromGPU(atom->cl_v, cuda_cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + memcpyFromGPU(atom->cl_f, cuda_cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); +} + +extern "C" +void cudaDeviceFree() { + cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_x)); + cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_v)); + cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_f)); + cuda_assert("cudaDeviceFree", cudaFree(cuda_numneigh)); + cuda_assert("cudaDeviceFree", 
cudaFree(cuda_neighbors)); + cuda_assert("cudaDeviceFree", cudaFree(cuda_natoms)); + cuda_assert("cudaDeviceFree", cudaFree(cuda_border_map)); + cuda_assert("cudaDeviceFree", cudaFree(cuda_jclusters_natoms)); + cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCx)); + cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCy)); + cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCz)); + free(natoms); + free(ngatoms); +} + +__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f, + int *cuda_natoms, + int Nclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) { + + unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x; + if (ci_pos >= Nclusters_local) return; + + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos); + MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base]; + MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base]; + MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base]; + + for (int cii = 0; cii < cuda_natoms[ci_pos]; cii++) { + ci_v[CL_X_OFFSET + cii] += dtforce * ci_f[CL_X_OFFSET + cii]; + ci_v[CL_Y_OFFSET + cii] += dtforce * ci_f[CL_Y_OFFSET + cii]; + ci_v[CL_Z_OFFSET + cii] += dtforce * ci_f[CL_Z_OFFSET + cii]; + ci_x[CL_X_OFFSET + cii] += dt * ci_v[CL_X_OFFSET + cii]; + ci_x[CL_Y_OFFSET + cii] += dt * ci_v[CL_Y_OFFSET + cii]; + ci_x[CL_Z_OFFSET + cii] += dt * ci_v[CL_Z_OFFSET + cii]; + } +} + +__global__ void cudaUpdatePbc_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map, + int *cuda_jclusters_natoms, + int *cuda_PBCx, + int *cuda_PBCy, + int *cuda_PBCz, + int Nclusters_local, + int Nclusters_ghost, + MD_FLOAT param_xprd, + MD_FLOAT param_yprd, + MD_FLOAT param_zprd) { + unsigned int cg = blockDim.x * blockIdx.x + threadIdx.x; + if (cg >= Nclusters_ghost) return; + + int jfac = MAX(1, CLUSTER_N / CLUSTER_M); + int ncj = Nclusters_local / jfac; + MD_FLOAT xprd = param_xprd; + MD_FLOAT yprd = param_yprd; + MD_FLOAT zprd = param_zprd; + + const int cj = ncj + cg; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + int bmap_vec_base = CJ_VECTOR_BASE_INDEX(cuda_border_map[cg]); + MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base]; + MD_FLOAT *bmap_x = &cuda_cl_x[bmap_vec_base]; + + for(int cjj = 0; cjj < cuda_jclusters_natoms[cg]; cjj++) { + cj_x[CL_X_OFFSET + cjj] = bmap_x[CL_X_OFFSET + cjj] + cuda_PBCx[cg] * xprd; + cj_x[CL_Y_OFFSET + cjj] = bmap_x[CL_Y_OFFSET + cjj] + cuda_PBCy[cg] * yprd; + cj_x[CL_Z_OFFSET + cjj] = bmap_x[CL_Z_OFFSET + cjj] + cuda_PBCz[cg] * zprd; + } +} + +__global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f, + int Nclusters_local, int Nclusters_max, + int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs, + MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) { + + unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y; + unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z; + if ((ci_pos >= Nclusters_local) || (cii_pos >= CLUSTER_M) || (cjj_pos >= CLUSTER_N)) return; + + int ci_cj0 = CJ0_FROM_CI(ci_pos); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos); + MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base]; + MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base]; + int numneighs = cuda_numneigh[ci_pos]; + for(int k = 0; k < numneighs; k++) { + int cj = (&cuda_neighs[ci_pos * maxneighs])[k]; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base]; + MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base]; + + MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii_pos]; + MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii_pos]; + MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii_pos]; + MD_FLOAT fix = 0; + 
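+        // The Lennard-Jones pair force below uses sr2 = 1/r^2 and
+        // sr6 = sigma^6/r^6, so force = 48*epsilon*sr6*(sr6 - 0.5)*sr2 is F/r
+        // and delx*force etc. are the Cartesian force components. Pairs on the
+        // cluster diagonal are masked via cond, and with half neighbor lists
+        // the mirrored contribution is written to cj_f with atomicAdd.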
+        MD_FLOAT fiy = 0;
+        MD_FLOAT fiz = 0;
+
+        int cond;
+#if CLUSTER_M == CLUSTER_N
+        cond = half_neigh ? (ci_cj0 != cj || cii_pos < cjj_pos) :
+                            (ci_cj0 != cj || cii_pos != cjj_pos);
+#elif CLUSTER_M < CLUSTER_N
+        cond = half_neigh ? (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) < cjj_pos) :
+                            (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos);
+#endif
+        if(cond) {
+            MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj_pos];
+            MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj_pos];
+            MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj_pos];
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+            if(rsq < cutforcesq) {
+                MD_FLOAT sr2 = 1.0 / rsq;
+                MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
+                MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+
+                if(half_neigh) {
+                    atomicAdd(&cj_f[CL_X_OFFSET + cjj_pos], -delx * force);
+                    atomicAdd(&cj_f[CL_Y_OFFSET + cjj_pos], -dely * force);
+                    atomicAdd(&cj_f[CL_Z_OFFSET + cjj_pos], -delz * force);
+                }
+
+                fix += delx * force;
+                fiy += dely * force;
+                fiz += delz * force;
+
+                atomicAdd(&ci_f[CL_X_OFFSET + cii_pos], fix);
+                atomicAdd(&ci_f[CL_Y_OFFSET + cii_pos], fiy);
+                atomicAdd(&ci_f[CL_Z_OFFSET + cii_pos], fiz);
+            }
+        }
+    }
+}
+
+__global__ void cudaFinalIntegrate_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
+                                        int *cuda_natoms,
+                                        int Nclusters_local, MD_FLOAT dtforce) {
+
+    unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
+    if (ci_pos >= Nclusters_local) return;
+
+    int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
+    MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
+    MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
+
+    for (int cii = 0; cii < cuda_natoms[ci_pos]; cii++) {
+        ci_v[CL_X_OFFSET + cii] += dtforce * ci_f[CL_X_OFFSET + cii];
+        ci_v[CL_Y_OFFSET + cii] += dtforce * ci_f[CL_Y_OFFSET + cii];
+        ci_v[CL_Z_OFFSET + cii] += dtforce * ci_f[CL_Z_OFFSET + cii];
+    }
+}
+
+extern "C"
+void cudaInitialIntegrate(Parameter *param, Atom *atom) {
+    const int threads_num = 16;
+    dim3 block_size = dim3(threads_num, 1, 1);
+    dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
+    cudaInitialIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_v, cuda_cl_f,
+        cuda_natoms, atom->Nclusters_local, param->dtforce, param->dt);
+    cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
+    cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
+}
+
+/* update coordinates of ghost atoms */
+/* uses mapping created in setupPbc */
+extern "C"
+void cudaUpdatePbc(Atom *atom, Parameter *param) {
+    const int threads_num = 512;
+    dim3 block_size = dim3(threads_num, 1, 1);
+    dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);
+    cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
+        cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
+        atom->Nclusters_local, atom->Nclusters_ghost,
+        param->xprd, param->yprd, param->zprd);
+    cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
+    cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
+}
+
+extern "C"
+double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6 = param->sigma6;
+    MD_FLOAT epsilon = param->epsilon;
+
+    memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
+    if (isReneighboured) {
+        for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+            memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int));
+            memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] *
sizeof(int));
+        }
+
+        isReneighboured = 0;
+    }
+
+    const int threads_num = 1;
+    dim3 block_size = dim3(threads_num, CLUSTER_M, CLUSTER_N);
+    dim3 grid_size = dim3(atom->Nclusters_local/threads_num+1, 1, 1);
+    double S = getTimeStamp();
+    LIKWID_MARKER_START("force");
+    computeForceLJ_cuda_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_f,
+        atom->Nclusters_local, atom->Nclusters_max,
+        cuda_numneigh, cuda_neighbors,
+        neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
+        sigma6, epsilon);
+    cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
+    cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());
+    LIKWID_MARKER_STOP("force");
+    double E = getTimeStamp();
+    return E-S;
+}
+
+extern "C"
+void cudaFinalIntegrate(Parameter *param, Atom *atom) {
+    const int threads_num = 16;
+    dim3 block_size = dim3(threads_num, 1, 1);
+    dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
+    cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms, atom->Nclusters_local, param->dt);
+    cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
+    cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
+}
diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h
index 6716db2..8c4cb98 100644
--- a/gromacs/includes/atom.h
+++ b/gromacs/includes/atom.h
@@ -11,29 +11,43 @@
 #define DELTA 20000
 
-#define CLUSTER_M 4
-
 // Nbnxn layouts (as of GROMACS):
 // Simd4xN: M=4, N=VECTOR_WIDTH
 // Simd2xNN: M=4, N=(VECTOR_WIDTH/2)
+// Cuda: M=8, N=VECTOR_WIDTH
 
-// Simd2xNN (here used for single-precision)
-#if VECTOR_WIDTH > CLUSTER_M * 2
-# define KERNEL_NAME "Simd2xNN"
-# define CLUSTER_N (VECTOR_WIDTH / 2)
-# define computeForceLJ computeForceLJ_2xnn
-// Simd4xN
-#else
-# define KERNEL_NAME "Simd4xN"
+#ifdef CUDA_TARGET
+# undef VECTOR_WIDTH
+# define VECTOR_WIDTH 8
+# define KERNEL_NAME "CUDA"
+# define CLUSTER_M 8
 # define CLUSTER_N VECTOR_WIDTH
-# define computeForceLJ computeForceLJ_4xn
-#endif
-
-#ifdef USE_REFERENCE_VERSION
-# undef KERNEL_NAME
-# undef computeForceLJ
-# define KERNEL_NAME "Reference"
-# define computeForceLJ computeForceLJ_ref
+# define computeForceLJ computeForceLJ_cuda
+# define initialIntegrate cudaInitialIntegrate
+# define finalIntegrate cudaFinalIntegrate
+# define updatePbc cudaUpdatePbc
+#else
+# define CLUSTER_M 4
+// Simd2xNN (here used for single-precision)
+# if VECTOR_WIDTH > CLUSTER_M * 2
+# define KERNEL_NAME "Simd2xNN"
+# define CLUSTER_N (VECTOR_WIDTH / 2)
+# define computeForceLJ computeForceLJ_2xnn
+// Simd4xN
+# else
+# define KERNEL_NAME "Simd4xN"
+# define CLUSTER_N VECTOR_WIDTH
+# define computeForceLJ computeForceLJ_4xn
+# endif
+# ifdef USE_REFERENCE_VERSION
+# undef KERNEL_NAME
+# undef computeForceLJ
+# define KERNEL_NAME "Reference"
+# define computeForceLJ computeForceLJ_ref
+# endif
+# define initialIntegrate cpuInitialIntegrate
+# define finalIntegrate cpuFinalIntegrate
+# define updatePbc cpuUpdatePbc
 #endif
 
 #if CLUSTER_M == CLUSTER_N
diff --git a/gromacs/includes/integrate.h b/gromacs/includes/integrate.h
new file mode 100644
index 0000000..5236ed8
--- /dev/null
+++ b/gromacs/includes/integrate.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of MD-Bench.
+ * Use of this source code is governed by a LGPL-3.0
+ * license that can be found in the LICENSE file.
+ */ +#include +//--- +#include +#include +#include + +void cpuInitialIntegrate(Parameter *param, Atom *atom) { + DEBUG_MESSAGE("cpuInitialIntegrate start\n"); + + for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; + + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { + ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii]; + ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii]; + ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii]; + ci_x[CL_X_OFFSET + cii] += param->dt * ci_v[CL_X_OFFSET + cii]; + ci_x[CL_Y_OFFSET + cii] += param->dt * ci_v[CL_Y_OFFSET + cii]; + ci_x[CL_Z_OFFSET + cii] += param->dt * ci_v[CL_Z_OFFSET + cii]; + } + } + + DEBUG_MESSAGE("cpuInitialIntegrate end\n"); +} + +void cpuFinalIntegrate(Parameter *param, Atom *atom) { + DEBUG_MESSAGE("cpuFinalIntegrate start\n"); + + for(int ci = 0; ci < atom->Nclusters_local; ci++) { + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; + MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; + + for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { + ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii]; + ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii]; + ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii]; + } + } + + DEBUG_MESSAGE("cpuFinalIntegrate end\n"); +} + +#ifdef CUDA_TARGET +void cudaInitialIntegrate(Parameter*, Atom*); +void cudaFinalIntegrate(Parameter*, Atom*); +#endif diff --git a/gromacs/includes/pbc.h b/gromacs/includes/pbc.h index 4a04180..33ceb53 100644 --- a/gromacs/includes/pbc.h +++ b/gromacs/includes/pbc.h @@ -10,7 +10,11 @@ #ifndef __PBC_H_ #define __PBC_H_ extern void initPbc(); -extern void updatePbc(Atom*, Parameter*, int); +extern void cpuUpdatePbc(Atom*, Parameter*, int); extern void updateAtomsPbc(Atom*, Parameter*); extern void setupPbc(Atom*, Parameter*); + +#ifdef CUDA_TARGET +extern void cudaUpdatePbc(Atom*, Parameter*, int); +#endif #endif diff --git a/gromacs/main.c b/gromacs/main.c index b482992..e724348 100644 --- a/gromacs/main.c +++ b/gromacs/main.c @@ -9,19 +9,21 @@ //-- #include //-- -#include +#include #include +#include +#include +#include #include #include -#include +#include #include #include -#include #include -#include +#include +#include #include #include -#include #define HLINE "----------------------------------------------------------------------------\n" @@ -30,6 +32,14 @@ extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*); extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*); +#ifdef CUDA_TARGET +extern int isReneighboured; +extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats); +extern void copyDataToCUDADevice(Atom *atom); +extern void copyDataFromCUDADevice(Atom *atom); +extern void cudaDeviceFree(); +#endif + double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) { if(param->force_field == FF_EAM) { initEam(eam, param); } double S, E; @@ -43,7 +53,6 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats * initPbc(atom); initStats(stats); initNeighbor(neighbor, param); - if(param->input_file == NULL) { createAtom(atom, param); } else { @@ -56,6 
+65,7 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats * buildClusters(atom); defineJClusters(atom); setupPbc(atom, param); + initDevice(atom, neighbor); binClusters(atom); buildNeighbor(atom, neighbor); E = getTimeStamp(); @@ -78,46 +88,6 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) { return E-S; } -void initialIntegrate(Parameter *param, Atom *atom) { - DEBUG_MESSAGE("initialIntegrate start\n"); - - for(int ci = 0; ci < atom->Nclusters_local; ci++) { - int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); - MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; - MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; - MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; - - for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { - ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii]; - ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii]; - ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii]; - ci_x[CL_X_OFFSET + cii] += param->dt * ci_v[CL_X_OFFSET + cii]; - ci_x[CL_Y_OFFSET + cii] += param->dt * ci_v[CL_Y_OFFSET + cii]; - ci_x[CL_Z_OFFSET + cii] += param->dt * ci_v[CL_Z_OFFSET + cii]; - } - } - - DEBUG_MESSAGE("initialIntegrate end\n"); -} - -void finalIntegrate(Parameter *param, Atom *atom) { - DEBUG_MESSAGE("finalIntegrate start\n"); - - for(int ci = 0; ci < atom->Nclusters_local; ci++) { - int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); - MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; - MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base]; - - for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) { - ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii]; - ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii]; - ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii]; - } - } - - DEBUG_MESSAGE("finalIntegrate end\n"); -} - void printAtomState(Atom *atom) { printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n", atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax); @@ -244,6 +214,11 @@ int main(int argc, char** argv) { #if defined(MEM_TRACER) || defined(INDEX_TRACER) traceAddresses(¶m, &atom, &neighbor, n + 1); #endif + + #ifdef CUDA_TARGET + copyDataToCUDADevice(&atom); + #endif + if(param.force_field == FF_EAM) { timer[FORCE] = computeForceEam(&eam, ¶m, &atom, &neighbor, &stats); } else { @@ -271,7 +246,16 @@ int main(int argc, char** argv) { updatePbc(&atom, ¶m, 0); } else { + #ifdef CUDA_TARGET + copyDataFromCUDADevice(&atom); + #endif + timer[NEIGH] += reneighbour(¶m, &atom, &neighbor); + + #ifdef CUDA_TARGET + copyDataToCUDADevice(&atom); + isReneighboured = 1; + #endif } #if defined(MEM_TRACER) || defined(INDEX_TRACER) @@ -303,6 +287,10 @@ int main(int argc, char** argv) { } } + #ifdef CUDA_TARGET + copyDataFromCUDADevice(&atom); + #endif + timer[TOTAL] = getTimeStamp() - timer[TOTAL]; updateSingleAtoms(&atom); computeThermo(-1, ¶m, &atom); @@ -311,6 +299,10 @@ int main(int argc, char** argv) { xtc_end(); } + #ifdef CUDA_TARGET + cudaDeviceFree(); + #endif + printf(HLINE); printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes); printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n", diff --git a/gromacs/pbc.c b/gromacs/pbc.c index 8a484c5..d5320fa 100644 --- a/gromacs/pbc.c +++ b/gromacs/pbc.c @@ -29,7 +29,7 @@ void initPbc(Atom* atom) { /* update coordinates of ghost atoms */ /* uses mapping created in setupPbc */ -void updatePbc(Atom *atom, Parameter *param, int firstUpdate) { +void cpuUpdatePbc(Atom *atom, Parameter *param, int 
firstUpdate) { DEBUG_MESSAGE("updatePbc start\n"); int jfac = MAX(1, CLUSTER_N / CLUSTER_M); int ncj = atom->Nclusters_local / jfac; diff --git a/lammps/device_spec.c b/lammps/device_spec.c new file mode 100644 index 0000000..b6fec89 --- /dev/null +++ b/lammps/device_spec.c @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg. + * All rights reserved. This file is part of MD-Bench. + * Use of this source code is governed by a LGPL-3.0 + * license that can be found in the LICENSE file. + */ +#include + +#ifdef CUDA_TARGET + +void initDevice(Atom *atom, Neighbor *neighbor) { + DeviceAtom *d_atom = &(atom->d_atom); + DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor); + + d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); + d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); + d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); + d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs); + d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax); + + memcpyToGPU(d_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3); + memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3); + memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); + memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); + memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes); + memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax); +} + +#endif
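
Note on the cluster indexing used throughout the CUDA kernels above: the sketch
below illustrates how CI_VECTOR_BASE_INDEX and the CL_*_OFFSET macros are
assumed to address the cl_x/cl_v/cl_f arrays (x, y and z stored as three
consecutive blocks of CLUSTER_M values per i-cluster, with
CLUSTER_M = CLUSTER_N = 8 under CUDA_TARGET). The macro bodies given here are
illustrative assumptions only; the authoritative definitions live in
gromacs/includes/atom.h.

#include <stdio.h>

/* Illustrative only -- assumed layout of one i-cluster inside cl_x:
 * CLUSTER_M x-values, then CLUSTER_M y-values, then CLUSTER_M z-values. */
#define CLUSTER_M 8
#define CL_X_OFFSET 0
#define CL_Y_OFFSET CLUSTER_M
#define CL_Z_OFFSET (CLUSTER_M * 2)
#define CI_VECTOR_BASE_INDEX(ci) ((ci) * CLUSTER_M * 3)

int main(void) {
    int ci = 3, cii = 5;                    /* 6th atom of the 4th i-cluster */
    int base = CI_VECTOR_BASE_INDEX(ci);    /* 3 * 8 * 3 = 72 */
    printf("x: cl_x[%d]  y: cl_x[%d]  z: cl_x[%d]\n",
           base + CL_X_OFFSET + cii,        /* 77 */
           base + CL_Y_OFFSET + cii,        /* 85 */
           base + CL_Z_OFFSET + cii);       /* 93 */
    return 0;
}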