Integrate GROMACS GPU implementation into master branch
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
parent
493915fe95
commit
c70ebce4c1
@ -58,24 +58,6 @@ void memsetGPU(void *d_ptr, int value, size_t bytesize) {
|
||||
cuda_assert("memsetGPU", cudaMemset(d_ptr, value, bytesize));
|
||||
}
|
||||
|
||||
void initDevice(Atom *atom, Neighbor *neighbor) {
|
||||
DeviceAtom *d_atom = &(atom->d_atom);
|
||||
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
|
||||
|
||||
d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
|
||||
d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
|
||||
|
||||
memcpyToGPU(d_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||
memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||
memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax);
|
||||
}
|
||||
|
||||
#else
|
||||
void initDevice(Atom *atom, Neighbor *neighbor) {}
|
||||
void *allocateGPU(size_t bytesize) { return NULL; }
|
@ -4,6 +4,8 @@
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stddef.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <neighbor.h>
|
||||
|
317
gromacs/cuda/force_lj.cu
Normal file
317
gromacs/cuda/force_lj.cu
Normal file
@ -0,0 +1,317 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
extern "C" {
|
||||
|
||||
#include <stdio.h>
|
||||
//---
|
||||
#include <cuda.h>
|
||||
#include <driver_types.h>
|
||||
//---
|
||||
#include <likwid-marker.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <device.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <stats.h>
|
||||
#include <timing.h>
|
||||
#include <util.h>
|
||||
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
MD_FLOAT *cuda_cl_x;
|
||||
MD_FLOAT *cuda_cl_v;
|
||||
MD_FLOAT *cuda_cl_f;
|
||||
int *cuda_neighbors;
|
||||
int *cuda_numneigh;
|
||||
int *cuda_natoms;
|
||||
int *natoms;
|
||||
int *ngatoms;
|
||||
int *cuda_border_map;
|
||||
int *cuda_jclusters_natoms;
|
||||
MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
|
||||
MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
|
||||
MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
|
||||
int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
|
||||
int isReneighboured;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void initDevice(Atom *atom, Neighbor *neighbor) {
|
||||
cuda_assert("cudaDeviceSetup", cudaDeviceReset());
|
||||
cuda_assert("cudaDeviceSetup", cudaSetDevice(0));
|
||||
cuda_cl_x = (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
cuda_cl_v = (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
cuda_cl_f = (MD_FLOAT *) allocateGPU(atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
cuda_natoms = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_jclusters_natoms = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_border_map = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_PBCx = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_PBCy = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_PBCz = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_numneigh = (int *) allocateGPU(atom->Nclusters_max * sizeof(int));
|
||||
cuda_neighbors = (int *) allocateGPU(atom->Nclusters_max * neighbor->maxneighs * sizeof(int));
|
||||
natoms = (int *) malloc(atom->Nclusters_max);
|
||||
ngatoms = (int *) malloc(atom->Nclusters_max);
|
||||
isReneighboured = 1;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void copyDataToCUDADevice(Atom *atom) {
|
||||
memcpyToGPU(cuda_cl_x, atom->cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
memcpyToGPU(cuda_cl_v, atom->cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
memcpyToGPU(cuda_cl_f, atom->cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
natoms[ci] = atom->iclusters[ci].natoms;
|
||||
}
|
||||
|
||||
memcpyToGPU(cuda_natoms, natoms, atom->Nclusters_local * sizeof(int));
|
||||
|
||||
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
int ncj = atom->Nclusters_local / jfac;
|
||||
for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
|
||||
const int cj = ncj + cg;
|
||||
ngatoms[cg] = atom->jclusters[cj].natoms;
|
||||
}
|
||||
|
||||
memcpyToGPU(cuda_jclusters_natoms, ngatoms, atom->Nclusters_ghost * sizeof(int));
|
||||
memcpyToGPU(cuda_border_map, atom->border_map, atom->Nclusters_ghost * sizeof(int));
|
||||
memcpyToGPU(cuda_PBCx, atom->PBCx, atom->Nclusters_ghost * sizeof(int));
|
||||
memcpyToGPU(cuda_PBCy, atom->PBCy, atom->Nclusters_ghost * sizeof(int));
|
||||
memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void copyDataFromCUDADevice(Atom *atom) {
|
||||
memcpyFromGPU(atom->cl_x, cuda_cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
memcpyFromGPU(atom->cl_v, cuda_cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
memcpyFromGPU(atom->cl_f, cuda_cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void cudaDeviceFree() {
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_x));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_v));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_cl_f));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_numneigh));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_neighbors));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_natoms));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_border_map));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_jclusters_natoms));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCx));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCy));
|
||||
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCz));
|
||||
free(natoms);
|
||||
free(ngatoms);
|
||||
}
|
||||
|
||||
__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
|
||||
int *cuda_natoms,
|
||||
int Nclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {
|
||||
|
||||
unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (ci_pos >= Nclusters_local) return;
|
||||
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
|
||||
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
|
||||
|
||||
for (int cii = 0; cii < cuda_natoms[ci_pos]; cii++) {
|
||||
ci_v[CL_X_OFFSET + cii] += dtforce * ci_f[CL_X_OFFSET + cii];
|
||||
ci_v[CL_Y_OFFSET + cii] += dtforce * ci_f[CL_Y_OFFSET + cii];
|
||||
ci_v[CL_Z_OFFSET + cii] += dtforce * ci_f[CL_Z_OFFSET + cii];
|
||||
ci_x[CL_X_OFFSET + cii] += dt * ci_v[CL_X_OFFSET + cii];
|
||||
ci_x[CL_Y_OFFSET + cii] += dt * ci_v[CL_Y_OFFSET + cii];
|
||||
ci_x[CL_Z_OFFSET + cii] += dt * ci_v[CL_Z_OFFSET + cii];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void cudaUpdatePbc_warp(MD_FLOAT *cuda_cl_x, int *cuda_border_map,
|
||||
int *cuda_jclusters_natoms,
|
||||
int *cuda_PBCx,
|
||||
int *cuda_PBCy,
|
||||
int *cuda_PBCz,
|
||||
int Nclusters_local,
|
||||
int Nclusters_ghost,
|
||||
MD_FLOAT param_xprd,
|
||||
MD_FLOAT param_yprd,
|
||||
MD_FLOAT param_zprd) {
|
||||
unsigned int cg = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (cg >= Nclusters_ghost) return;
|
||||
|
||||
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
int ncj = Nclusters_local / jfac;
|
||||
MD_FLOAT xprd = param_xprd;
|
||||
MD_FLOAT yprd = param_yprd;
|
||||
MD_FLOAT zprd = param_zprd;
|
||||
|
||||
const int cj = ncj + cg;
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
int bmap_vec_base = CJ_VECTOR_BASE_INDEX(cuda_border_map[cg]);
|
||||
MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
|
||||
MD_FLOAT *bmap_x = &cuda_cl_x[bmap_vec_base];
|
||||
|
||||
for(int cjj = 0; cjj < cuda_jclusters_natoms[cg]; cjj++) {
|
||||
cj_x[CL_X_OFFSET + cjj] = bmap_x[CL_X_OFFSET + cjj] + cuda_PBCx[cg] * xprd;
|
||||
cj_x[CL_Y_OFFSET + cjj] = bmap_x[CL_Y_OFFSET + cjj] + cuda_PBCy[cg] * yprd;
|
||||
cj_x[CL_Z_OFFSET + cjj] = bmap_x[CL_Z_OFFSET + cjj] + cuda_PBCz[cg] * zprd;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void computeForceLJ_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
|
||||
int Nclusters_local, int Nclusters_max,
|
||||
int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
|
||||
MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {
|
||||
|
||||
unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
|
||||
unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;
|
||||
if ((ci_pos >= Nclusters_local) || (cii_pos >= CLUSTER_M) || (cjj_pos >= CLUSTER_N)) return;
|
||||
|
||||
int ci_cj0 = CJ0_FROM_CI(ci_pos);
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
|
||||
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
|
||||
int numneighs = cuda_numneigh[ci_pos];
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int cj = (&cuda_neighs[ci_pos * maxneighs])[k];
|
||||
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
|
||||
MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
|
||||
MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];
|
||||
|
||||
MD_FLOAT xtmp = ci_x[CL_X_OFFSET + cii_pos];
|
||||
MD_FLOAT ytmp = ci_x[CL_Y_OFFSET + cii_pos];
|
||||
MD_FLOAT ztmp = ci_x[CL_Z_OFFSET + cii_pos];
|
||||
MD_FLOAT fix = 0;
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
int cond;
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
cond = half_neigh ? (ci_cj0 != cj || cii_pos < cjj_pos) :
|
||||
(ci_cj0 != cj || cii_pos != cjj_pos);
|
||||
#elif CLUSTER_M < CLUSTER_N
|
||||
cond = half_neigh ? (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) < cjj_pos) :
|
||||
(ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos);
|
||||
#endif
|
||||
if(cond) {
|
||||
MD_FLOAT delx = xtmp - cj_x[CL_X_OFFSET + cjj_pos];
|
||||
MD_FLOAT dely = ytmp - cj_x[CL_Y_OFFSET + cjj_pos];
|
||||
MD_FLOAT delz = ztmp - cj_x[CL_Z_OFFSET + cjj_pos];
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = 1.0 / rsq;
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
||||
|
||||
if(half_neigh) {
|
||||
atomicAdd(&cj_f[CL_X_OFFSET + cjj_pos], -delx * force);
|
||||
atomicAdd(&cj_f[CL_Y_OFFSET + cjj_pos], -dely * force);
|
||||
atomicAdd(&cj_f[CL_Z_OFFSET + cjj_pos], -delz * force);
|
||||
}
|
||||
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
|
||||
atomicAdd(&ci_f[CL_X_OFFSET + cii_pos], fix);
|
||||
atomicAdd(&ci_f[CL_Y_OFFSET + cii_pos], fiy);
|
||||
atomicAdd(&ci_f[CL_Z_OFFSET + cii_pos], fiz);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void cudaFinalIntegrate_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
|
||||
int *cuda_natoms,
|
||||
int Nclusters_local, MD_FLOAT dtforce) {
|
||||
|
||||
unsigned int ci_pos = blockDim.x * blockIdx.x + threadIdx.x;
|
||||
if (ci_pos >= Nclusters_local) return;
|
||||
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci_pos);
|
||||
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
|
||||
|
||||
for (int cii = 0; cii < cuda_natoms[ci_pos]; cii++) {
|
||||
ci_v[CL_X_OFFSET + cii] += dtforce * ci_f[CL_X_OFFSET + cii];
|
||||
ci_v[CL_Y_OFFSET + cii] += dtforce * ci_f[CL_Y_OFFSET + cii];
|
||||
ci_v[CL_Z_OFFSET + cii] += dtforce * ci_f[CL_Z_OFFSET + cii];
|
||||
}
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void cudaInitialIntegrate(Parameter *param, Atom *atom) {
|
||||
const int threads_num = 16;
|
||||
dim3 block_size = dim3(threads_num, 1, 1);
|
||||
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
|
||||
cudaInitialIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_v, cuda_cl_f,
|
||||
cuda_natoms, atom->Nclusters_local, param->dtforce, param->dt);
|
||||
cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
|
||||
cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
extern "C"
|
||||
void cudaUpdatePbc(Atom *atom, Parameter *param) {
|
||||
const int threads_num = 512;
|
||||
dim3 block_size = dim3(threads_num, 1, 1);;
|
||||
dim3 grid_size = dim3(atom->Nclusters_ghost/(threads_num)+1, 1, 1);;
|
||||
cudaUpdatePbc_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_border_map,
|
||||
cuda_jclusters_natoms, cuda_PBCx, cuda_PBCy, cuda_PBCz,
|
||||
atom->Nclusters_local, atom->Nclusters_ghost,
|
||||
param->xprd, param->yprd, param->zprd);
|
||||
cuda_assert("cudaUpdatePbc", cudaPeekAtLastError());
|
||||
cuda_assert("cudaUpdatePbc", cudaDeviceSynchronize());
|
||||
}
|
||||
|
||||
extern "C"
|
||||
double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
|
||||
memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
|
||||
if (isReneighboured) {
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int));
|
||||
memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));
|
||||
}
|
||||
|
||||
isReneighboured = 0;
|
||||
}
|
||||
|
||||
const int threads_num = 1;
|
||||
dim3 block_size = dim3(threads_num, CLUSTER_M, CLUSTER_N);
|
||||
dim3 grid_size = dim3(atom->Nclusters_local/threads_num+1, 1, 1);
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
computeForceLJ_cuda_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_f,
|
||||
atom->Nclusters_local, atom->Nclusters_max,
|
||||
cuda_numneigh, cuda_neighbors,
|
||||
neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
|
||||
sigma6, epsilon);
|
||||
cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
|
||||
cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());
|
||||
LIKWID_MARKER_STOP("force");
|
||||
double E = getTimeStamp();
|
||||
return E-S;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
void cudaFinalIntegrate(Parameter *param, Atom *atom) {
|
||||
const int threads_num = 16;
|
||||
dim3 block_size = dim3(threads_num, 1, 1);
|
||||
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
|
||||
cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms, atom->Nclusters_local, param->dt);
|
||||
cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
|
||||
cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
|
||||
}
|
@ -11,29 +11,43 @@
|
||||
|
||||
#define DELTA 20000
|
||||
|
||||
#define CLUSTER_M 4
|
||||
|
||||
// Nbnxn layouts (as of GROMACS):
|
||||
// Simd4xN: M=4, N=VECTOR_WIDTH
|
||||
// Simd2xNN: M=4, N=(VECTOR_WIDTH/2)
|
||||
// Cuda: M=8, N=VECTOR_WIDTH
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
# undef VECTOR_WIDTH
|
||||
# define VECTOR_WIDTH 8
|
||||
# define KERNEL_NAME "CUDA"
|
||||
# define CLUSTER_M 8
|
||||
# define CLUSTER_N VECTOR_WIDTH
|
||||
# define computeForceLJ computeForceLJ_cuda
|
||||
# define initialIntegrate cudaInitialIntegrate
|
||||
# define finalIntegrate cudaFinalIntegrate
|
||||
# define updatePbc cudaUpdatePbc
|
||||
#else
|
||||
# define CLUSTER_M 4
|
||||
// Simd2xNN (here used for single-precision)
|
||||
#if VECTOR_WIDTH > CLUSTER_M * 2
|
||||
# if VECTOR_WIDTH > CLUSTER_M * 2
|
||||
# define KERNEL_NAME "Simd2xNN"
|
||||
# define CLUSTER_N (VECTOR_WIDTH / 2)
|
||||
# define computeForceLJ computeForceLJ_2xnn
|
||||
// Simd4xN
|
||||
#else
|
||||
# else
|
||||
# define KERNEL_NAME "Simd4xN"
|
||||
# define CLUSTER_N VECTOR_WIDTH
|
||||
# define computeForceLJ computeForceLJ_4xn
|
||||
#endif
|
||||
|
||||
#ifdef USE_REFERENCE_VERSION
|
||||
# endif
|
||||
# ifdef USE_REFERENCE_VERSION
|
||||
# undef KERNEL_NAME
|
||||
# undef computeForceLJ
|
||||
# define KERNEL_NAME "Reference"
|
||||
# define computeForceLJ computeForceLJ_ref
|
||||
# endif
|
||||
# define initialIntegrate cpuInitialIntegrate
|
||||
# define finalIntegrate cpuFinalIntegrate
|
||||
# define updatePbc cpuUpdatePbc
|
||||
#endif
|
||||
|
||||
#if CLUSTER_M == CLUSTER_N
|
||||
|
56
gromacs/includes/integrate.h
Normal file
56
gromacs/includes/integrate.h
Normal file
@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <stdbool.h>
|
||||
//---
|
||||
#include <atom.h>
|
||||
#include <parameter.h>
|
||||
#include <util.h>
|
||||
|
||||
void cpuInitialIntegrate(Parameter *param, Atom *atom) {
|
||||
DEBUG_MESSAGE("cpuInitialIntegrate start\n");
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
|
||||
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
|
||||
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
|
||||
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
|
||||
ci_x[CL_X_OFFSET + cii] += param->dt * ci_v[CL_X_OFFSET + cii];
|
||||
ci_x[CL_Y_OFFSET + cii] += param->dt * ci_v[CL_Y_OFFSET + cii];
|
||||
ci_x[CL_Z_OFFSET + cii] += param->dt * ci_v[CL_Z_OFFSET + cii];
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("cpuInitialIntegrate end\n");
|
||||
}
|
||||
|
||||
void cpuFinalIntegrate(Parameter *param, Atom *atom) {
|
||||
DEBUG_MESSAGE("cpuFinalIntegrate start\n");
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
|
||||
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
|
||||
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
|
||||
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("cpuFinalIntegrate end\n");
|
||||
}
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
void cudaInitialIntegrate(Parameter*, Atom*);
|
||||
void cudaFinalIntegrate(Parameter*, Atom*);
|
||||
#endif
|
@ -10,7 +10,11 @@
|
||||
#ifndef __PBC_H_
|
||||
#define __PBC_H_
|
||||
extern void initPbc();
|
||||
extern void updatePbc(Atom*, Parameter*, int);
|
||||
extern void cpuUpdatePbc(Atom*, Parameter*, int);
|
||||
extern void updateAtomsPbc(Atom*, Parameter*);
|
||||
extern void setupPbc(Atom*, Parameter*);
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
extern void cudaUpdatePbc(Atom*, Parameter*, int);
|
||||
#endif
|
||||
#endif
|
||||
|
@ -9,19 +9,21 @@
|
||||
//--
|
||||
#include <likwid-marker.h>
|
||||
//--
|
||||
#include <timing.h>
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
#include <device.h>
|
||||
#include <eam.h>
|
||||
#include <integrate.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <pbc.h>
|
||||
#include <stats.h>
|
||||
#include <thermo.h>
|
||||
#include <pbc.h>
|
||||
#include <timers.h>
|
||||
#include <eam.h>
|
||||
#include <timing.h>
|
||||
#include <util.h>
|
||||
#include <vtk.h>
|
||||
#include <xtc.h>
|
||||
#include <util.h>
|
||||
|
||||
#define HLINE "----------------------------------------------------------------------------\n"
|
||||
|
||||
@ -30,6 +32,14 @@ extern double computeForceLJ_4xn(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceLJ_2xnn(Parameter*, Atom*, Neighbor*, Stats*);
|
||||
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
extern int isReneighboured;
|
||||
extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
|
||||
extern void copyDataToCUDADevice(Atom *atom);
|
||||
extern void copyDataFromCUDADevice(Atom *atom);
|
||||
extern void cudaDeviceFree();
|
||||
#endif
|
||||
|
||||
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
||||
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
||||
double S, E;
|
||||
@ -43,7 +53,6 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
|
||||
initPbc(atom);
|
||||
initStats(stats);
|
||||
initNeighbor(neighbor, param);
|
||||
|
||||
if(param->input_file == NULL) {
|
||||
createAtom(atom, param);
|
||||
} else {
|
||||
@ -56,6 +65,7 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
|
||||
buildClusters(atom);
|
||||
defineJClusters(atom);
|
||||
setupPbc(atom, param);
|
||||
initDevice(atom, neighbor);
|
||||
binClusters(atom);
|
||||
buildNeighbor(atom, neighbor);
|
||||
E = getTimeStamp();
|
||||
@ -78,46 +88,6 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
||||
return E-S;
|
||||
}
|
||||
|
||||
void initialIntegrate(Parameter *param, Atom *atom) {
|
||||
DEBUG_MESSAGE("initialIntegrate start\n");
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
|
||||
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
|
||||
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
|
||||
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
|
||||
ci_x[CL_X_OFFSET + cii] += param->dt * ci_v[CL_X_OFFSET + cii];
|
||||
ci_x[CL_Y_OFFSET + cii] += param->dt * ci_v[CL_Y_OFFSET + cii];
|
||||
ci_x[CL_Z_OFFSET + cii] += param->dt * ci_v[CL_Z_OFFSET + cii];
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("initialIntegrate end\n");
|
||||
}
|
||||
|
||||
void finalIntegrate(Parameter *param, Atom *atom) {
|
||||
DEBUG_MESSAGE("finalIntegrate start\n");
|
||||
|
||||
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
|
||||
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
|
||||
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
|
||||
MD_FLOAT *ci_f = &atom->cl_f[ci_vec_base];
|
||||
|
||||
for(int cii = 0; cii < atom->iclusters[ci].natoms; cii++) {
|
||||
ci_v[CL_X_OFFSET + cii] += param->dtforce * ci_f[CL_X_OFFSET + cii];
|
||||
ci_v[CL_Y_OFFSET + cii] += param->dtforce * ci_f[CL_Y_OFFSET + cii];
|
||||
ci_v[CL_Z_OFFSET + cii] += param->dtforce * ci_f[CL_Z_OFFSET + cii];
|
||||
}
|
||||
}
|
||||
|
||||
DEBUG_MESSAGE("finalIntegrate end\n");
|
||||
}
|
||||
|
||||
void printAtomState(Atom *atom) {
|
||||
printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
|
||||
atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);
|
||||
@ -244,6 +214,11 @@ int main(int argc, char** argv) {
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||
#endif
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataToCUDADevice(&atom);
|
||||
#endif
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
timer[FORCE] = computeForceEam(&eam, ¶m, &atom, &neighbor, &stats);
|
||||
} else {
|
||||
@ -271,7 +246,16 @@ int main(int argc, char** argv) {
|
||||
|
||||
updatePbc(&atom, ¶m, 0);
|
||||
} else {
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataFromCUDADevice(&atom);
|
||||
#endif
|
||||
|
||||
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor);
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataToCUDADevice(&atom);
|
||||
isReneighboured = 1;
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||
@ -303,6 +287,10 @@ int main(int argc, char** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
copyDataFromCUDADevice(&atom);
|
||||
#endif
|
||||
|
||||
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
|
||||
updateSingleAtoms(&atom);
|
||||
computeThermo(-1, ¶m, &atom);
|
||||
@ -311,6 +299,10 @@ int main(int argc, char** argv) {
|
||||
xtc_end();
|
||||
}
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
cudaDeviceFree();
|
||||
#endif
|
||||
|
||||
printf(HLINE);
|
||||
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
|
||||
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
|
||||
|
@ -29,7 +29,7 @@ void initPbc(Atom* atom) {
|
||||
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
void updatePbc(Atom *atom, Parameter *param, int firstUpdate) {
|
||||
void cpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
|
||||
DEBUG_MESSAGE("updatePbc start\n");
|
||||
int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
|
||||
int ncj = atom->Nclusters_local / jfac;
|
||||
|
29
lammps/device_spec.c
Normal file
29
lammps/device_spec.c
Normal file
@ -0,0 +1,29 @@
|
||||
/*
|
||||
* Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of MD-Bench.
|
||||
* Use of this source code is governed by a LGPL-3.0
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <device.h>
|
||||
|
||||
#ifdef CUDA_TARGET
|
||||
|
||||
void initDevice(Atom *atom, Neighbor *neighbor) {
|
||||
DeviceAtom *d_atom = &(atom->d_atom);
|
||||
DeviceNeighbor *d_neighbor = &(neighbor->d_neighbor);
|
||||
|
||||
d_atom->epsilon = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->sigma6 = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_atom->cutforcesq = (MD_FLOAT *) allocateGPU(sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
d_neighbor->neighbors = (int *) allocateGPU(sizeof(int) * atom->Nmax * neighbor->maxneighs);
|
||||
d_neighbor->numneigh = (int *) allocateGPU(sizeof(int) * atom->Nmax);
|
||||
|
||||
memcpyToGPU(d_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||
memcpyToGPU(d_atom->vx, atom->vx, sizeof(MD_FLOAT) * atom->Nmax * 3);
|
||||
memcpyToGPU(d_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes);
|
||||
memcpyToGPU(d_atom->type, atom->type, sizeof(int) * atom->Nmax);
|
||||
}
|
||||
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user