diff --git a/Makefile b/Makefile
index c82abdf..668461a 100644
--- a/Makefile
+++ b/Makefile
@@ -3,6 +3,7 @@ TARGET = MDBench-$(TAG)-$(OPT_SCHEME)
 BUILD_DIR = ./$(TAG)-$(OPT_SCHEME)
 SRC_DIR   = ./$(OPT_SCHEME)
 ASM_DIR   = ./asm
+CUDA_DIR  = ./$(SRC_DIR)/cuda
 MAKE_DIR  = ./
 Q        ?= @
@@ -15,12 +16,12 @@ include $(MAKE_DIR)/include_GROMACS.mk
 INCLUDES += -I./$(SRC_DIR)/includes

 ifeq ($(strip $(DATA_LAYOUT)),AOS)
-DEFINES += -DAOS
+    DEFINES += -DAOS
 endif
 ifeq ($(strip $(DATA_TYPE)),SP)
-DEFINES += -DPRECISION=1
+    DEFINES += -DPRECISION=1
 else
-DEFINES += -DPRECISION=2
+    DEFINES += -DPRECISION=2
 endif

 ifneq ($(ASM_SYNTAX), ATT)
@@ -79,11 +80,14 @@ ifeq ($(strip $(USE_SIMD_KERNEL)),true)
 DEFINES += -DUSE_SIMD_KERNEL
 endif

-VPATH     = $(SRC_DIR) $(ASM_DIR)
+VPATH     = $(SRC_DIR) $(ASM_DIR) $(CUDA_DIR)
 ASM       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
 OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
 OBJ       = $(filter-out $(BUILD_DIR)/main% $(OVERWRITE),$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
 OBJ      += $(patsubst $(ASM_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*.s))
+ifeq ($(strip $(TAG)),NVCC)
+OBJ      += $(patsubst $(CUDA_DIR)/%.cu, $(BUILD_DIR)/%-cuda.o,$(wildcard $(CUDA_DIR)/*.cu))
+endif
 CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(OPTIONS) $(INCLUDES)

 # $(warning $(OBJ))
@@ -106,6 +110,11 @@ $(BUILD_DIR)/%.o: %.c
 	$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
 	$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d

+$(BUILD_DIR)/%-cuda.o: %.cu
+	$(info ===>  COMPILE  $@)
+	$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
+	$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
+
 $(BUILD_DIR)/%.s: %.c
 	$(info ===>  GENERATE ASM  $@)
 	$(Q)$(CC) -S $(ASFLAGS) $(CPPFLAGS) $(CFLAGS) $< -o $@
diff --git a/config.mk b/config.mk
index 8140400..a2da5f6 100644
--- a/config.mk
+++ b/config.mk
@@ -1,4 +1,4 @@
-# Compiler tag (GCC/CLANG/ICC/ONEAPI)
+# Compiler tag (GCC/CLANG/ICC/ONEAPI/NVCC)
 TAG ?= ICC
 # Instruction set (SSE/AVX/AVX2/AVX512)
 ISA ?= AVX512
diff --git a/include_NVCC.mk b/include_NVCC.mk
new file mode 100644
index 0000000..07edb9b
--- /dev/null
+++ b/include_NVCC.mk
@@ -0,0 +1,16 @@
+CC     = nvcc
+LINKER = $(CC)
+
+ANSI_CFLAGS  = -ansi
+ANSI_CFLAGS += -std=c99
+ANSI_CFLAGS += -pedantic
+ANSI_CFLAGS += -Wextra
+
+# CFLAGS = -O0 -g -std=c99 -fargument-noalias
+#CFLAGS  = -O3 -g -arch=sm_61 # -fopenmp
+CFLAGS   = -O3 -g # -fopenmp
+ASFLAGS  = -masm=intel
+LFLAGS   =
+DEFINES  = -D_GNU_SOURCE -DCUDA_TARGET -DNO_ZMM_INTRIN #-DLIKWID_PERFMON
+INCLUDES = $(LIKWID_INC)
+LIBS     = -lm $(LIKWID_LIB) -lcuda -lcudart #-llikwid
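With those three pieces in place the CUDA port builds like any other variant: set TAG = NVCC in config.mk (or override it per invocation with `make TAG=NVCC`, since the assignment uses `?=`), and the new rule compiles every .cu file under the cuda/ subdirectory into $(BUILD_DIR)/*-cuda.o objects that get linked into the MDBench-NVCC-* binary. The -DCUDA_TARGET define in include_NVCC.mk is what flips all of the source-level switches introduced below.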
diff --git a/lammps/allocate.c b/lammps/allocate.c
index 3aa4e1a..8c885c3 100644
--- a/lammps/allocate.c
+++ b/lammps/allocate.c
@@ -24,28 +24,24 @@
 #include <stdlib.h>
 #include <errno.h>
 #include <stdio.h>
+#include <string.h>

-void* allocate (int alignment, size_t bytesize)
-{
+void *allocate(int alignment, size_t bytesize) {
     int errorCode;
     void* ptr;

-    errorCode = posix_memalign(&ptr, alignment, bytesize);
-
-    if (errorCode) {
-        if (errorCode == EINVAL) {
-            fprintf(stderr,
-                    "Error: Alignment parameter is not a power of two\n");
-            exit(EXIT_FAILURE);
-        }
-        if (errorCode == ENOMEM) {
-            fprintf(stderr,
-                    "Error: Insufficient memory to fulfill the request\n");
-            exit(EXIT_FAILURE);
-        }
+    errorCode = posix_memalign(&ptr, alignment, bytesize);
+    if(errorCode == EINVAL) {
+        fprintf(stderr, "Error: Alignment parameter is not a power of two\n");
+        exit(EXIT_FAILURE);
     }

-    if (ptr == NULL) {
+    if(errorCode == ENOMEM) {
+        fprintf(stderr, "Error: Insufficient memory to fulfill the request\n");
+        exit(EXIT_FAILURE);
+    }
+
+    if(ptr == NULL) {
         fprintf(stderr, "Error: posix_memalign failed!\n");
         exit(EXIT_FAILURE);
     }
@@ -53,13 +49,8 @@ void* allocate (int alignment, size_t bytesize)
     return ptr;
 }

-void* reallocate (
-    void* ptr,
-    int alignment,
-    size_t newBytesize,
-    size_t oldBytesize)
-{
-    void* newarray = allocate(alignment, newBytesize);
+void *reallocate(void* ptr, int alignment, size_t newBytesize, size_t oldBytesize) {
+    void *newarray = allocate(alignment, newBytesize);

     if(ptr != NULL) {
         memcpy(newarray, ptr, oldBytesize);
@@ -68,3 +59,25 @@ void* reallocate (
     return newarray;
 }
+
+#ifndef CUDA_TARGET
+void *allocate_gpu(int alignment, size_t bytesize) { return NULL; }
+void *reallocate_gpu(void *ptr, int alignment, size_t newBytesize, size_t oldBytesize) { return NULL; }
+#else
+void *allocate_gpu(int alignment, size_t bytesize) {
+    void *ptr;
+    checkCUDAError("allocate_gpu", cudaMallocHost((void **) &ptr, bytesize));
+    return ptr;
+}
+
+// Note: unlike reallocate(), the old contents are NOT preserved
+void *reallocate_gpu(void *ptr, int alignment, size_t newBytesize, size_t oldBytesize) {
+    void *newarray = allocate_gpu(alignment, newBytesize);
+
+    if(ptr != NULL) {
+        cudaFreeHost(ptr);
+    }
+
+    return newarray;
+}
+#endif
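The CPU stubs above keep the linker happy for non-CUDA builds; under CUDA_TARGET the allocators hand out pinned (page-locked) host memory instead of posix_memalign'd pages, which is what lets the later cudaMemcpy calls reach full transfer bandwidth. A minimal, self-contained sketch of the difference (standard CUDA runtime API; the names are illustrative, not MD-Bench code):

    #include <stdlib.h>
    #include <cuda_runtime.h>

    int main(void) {
        const size_t n = 1 << 20;
        float *pageable = (float *)malloc(n * sizeof(float)); // ordinary heap memory
        float *pinned = NULL;
        cudaMallocHost((void **)&pinned, n * sizeof(float));  // page-locked host memory

        float *dev = NULL;
        cudaMalloc((void **)&dev, n * sizeof(float));

        // Both copies are legal; the pinned source avoids an internal staging
        // buffer and is eligible for asynchronous transfers.
        cudaMemcpy(dev, pageable, n * sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(dev, pinned, n * sizeof(float), cudaMemcpyHostToDevice);

        cudaFree(dev);
        cudaFreeHost(pinned); // pinned memory has its own free function
        free(pageable);
        return 0;
    }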
diff --git a/lammps/atom.c b/lammps/atom.c
index bd8d871..4dd6829 100644
--- a/lammps/atom.c
+++ b/lammps/atom.c
@@ -31,6 +31,10 @@
 #include <allocate.h>
 #include <util.h>

+#ifdef CUDA_TARGET
+#include <cuda_atom.h>
+#endif
+
 #define DELTA 20000

 #ifndef MAXLINE
@@ -67,7 +71,15 @@ void createAtom(Atom *atom, Parameter *param) {
     atom->Natoms = 4 * param->nx * param->ny * param->nz;
     atom->Nlocal = 0;
     atom->ntypes = param->ntypes;
+
+#ifdef CUDA_TARGET
+    checkCUDAError( "atom->epsilon cudaMallocHost", cudaMallocHost((void**)&(atom->epsilon), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) );
+    checkCUDAError( "atom->sigma6 cudaMallocHost", cudaMallocHost((void**)&(atom->sigma6), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) );
+    checkCUDAError( "atom->cutforcesq cudaMallocHost", cudaMallocHost((void**)&(atom->cutforcesq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) );
+    checkCUDAError( "atom->cutneighsq cudaMallocHost", cudaMallocHost((void**)&(atom->cutneighsq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) );
+#else
     atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
     atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
     atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
     atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
+#endif
diff --git a/lammps/cuda/atom.cu b/lammps/cuda/atom.cu
new file mode 100644
index 0000000..a2767e5
--- /dev/null
+++ b/lammps/cuda/atom.cu
@@ -0,0 +1,76 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+
+extern "C" {
+
+#include <stdio.h>
+#include <stdlib.h>
+//---
+#include <atom.h>
+#include <allocate.h>
+#include <cuda_atom.h>
+#include <neighbor.h>
+
+void initCuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
+    c_atom->Natoms = atom->Natoms;
+    c_atom->Nlocal = atom->Nlocal;
+    c_atom->Nghost = atom->Nghost;
+    c_atom->Nmax = atom->Nmax;
+    c_atom->ntypes = atom->ntypes;
+
+    c_atom->border_map = NULL;
+
+    const int Nlocal = atom->Nlocal;
+
+    checkCUDAError( "c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
+    checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
+
+    checkCUDAError( "c_atom->fx malloc", cudaMalloc((void**)&(c_atom->fx), sizeof(MD_FLOAT) * Nlocal * 3) );
+
+    checkCUDAError( "c_atom->vx malloc", cudaMalloc((void**)&(c_atom->vx), sizeof(MD_FLOAT) * Nlocal * 3) );
+    checkCUDAError( "c_atom->vx memcpy", cudaMemcpy(c_atom->vx, atom->vx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyHostToDevice) );
+
+    checkCUDAError( "c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
+    checkCUDAError( "c_atom->epsilon malloc", cudaMalloc((void**)&(c_atom->epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+    checkCUDAError( "c_atom->sigma6 malloc", cudaMalloc((void**)&(c_atom->sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+    checkCUDAError( "c_atom->cutforcesq malloc", cudaMalloc((void**)&(c_atom->cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
+
+    checkCUDAError( "c_neighbor->neighbors malloc", cudaMalloc((void**)&c_neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs) );
+    checkCUDAError( "c_neighbor->numneigh malloc", cudaMalloc((void**)&c_neighbor->numneigh, sizeof(int) * Nlocal) );
+
+    checkCUDAError( "c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
+    checkCUDAError( "c_atom->sigma6 memcpy", cudaMemcpy(c_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+    checkCUDAError( "c_atom->epsilon memcpy", cudaMemcpy(c_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+
+    checkCUDAError( "c_atom->cutforcesq memcpy", cudaMemcpy(c_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
+}
+
+void checkCUDAError(const char *msg, cudaError_t err) {
+    if (err != cudaSuccess) {
+        // print a human-readable error message and bail out
+        printf("[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err));
+        exit(-1);
+    }
+}
+
+}
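Every kernel introduced in this patch follows the same one-thread-per-atom convention: a ceil-divided grid sized from get_num_threads(), plus an early-out guard because the last block usually overshoots the atom count. Distilled into a self-contained sketch (kernel and names are illustrative only):

    __global__ void scale_atoms(float *x, int n) {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= n) return;   // the grid may be larger than n; guard like the kernels here
        x[i] *= 2.0f;         // stand-in for the real per-atom work
    }

    void launch_scale(float *d_x, int n, int threads_per_block) {
        // integer ceil-division, equivalent to the ceil((float)n / threads) in this patch
        const int blocks = (n + threads_per_block - 1) / threads_per_block;
        scale_atoms<<<blocks, threads_per_block>>>(d_x, n);
    }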
diff --git a/lammps/cuda/force.cu b/lammps/cuda/force.cu
new file mode 100644
index 0000000..d142a30
--- /dev/null
+++ b/lammps/cuda/force.cu
@@ -0,0 +1,203 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <math.h>
+//---
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_profiler_api.h>
+//---
+#include <likwid-marker.h>
+
+extern "C" {
+
+#include <atom.h>
+#include <allocate.h>
+#include <cuda_atom.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <timers.h>
+#include <util.h>
+
+}
+
+// CUDA kernel: one thread computes the LJ force on one local atom
+__global__ void calc_force(Atom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if(i >= Nlocal) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    const int numneighs = neigh_numneigh[i];
+
+    MD_FLOAT xtmp = atom_x(i);
+    MD_FLOAT ytmp = atom_y(i);
+    MD_FLOAT ztmp = atom_z(i);
+
+    MD_FLOAT fix = 0;
+    MD_FLOAT fiy = 0;
+    MD_FLOAT fiz = 0;
+
+#ifdef EXPLICIT_TYPES
+    const int type_i = atom->type[i];
+#endif
+
+    for(int k = 0; k < numneighs; k++) {
+        int j = neigh_neighbors[atom->Nlocal * k + i];
+        MD_FLOAT delx = xtmp - atom_x(j);
+        MD_FLOAT dely = ytmp - atom_y(j);
+        MD_FLOAT delz = ztmp - atom_z(j);
+        MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+#ifdef EXPLICIT_TYPES
+        const int type_j = atom->type[j];
+        const int type_ij = type_i * atom->ntypes + type_j;
+        const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
+        const MD_FLOAT sigma6 = atom->sigma6[type_ij];
+        const MD_FLOAT epsilon = atom->epsilon[type_ij];
+#endif
+
+        if(rsq < cutforcesq) {
+            MD_FLOAT sr2 = 1.0 / rsq;
+            MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
+            MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
+            fix += delx * force;
+            fiy += dely * force;
+            fiz += delz * force;
+        }
+    }
+
+    atom_fx(i) = fix;
+    atom_fy(i) = fiy;
+    atom_fz(i) = fiz;
+}
+
+__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    atom_vx(i) += dtforce * atom_fx(i);
+    atom_vy(i) += dtforce * atom_fy(i);
+    atom_vz(i) += dtforce * atom_fz(i);
+    atom_x(i) = atom_x(i) + dt * atom_vx(i);
+    atom_y(i) = atom_y(i) + dt * atom_vy(i);
+    atom_z(i) = atom_z(i) + dt * atom_vz(i);
+}
+
+__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+
+    atom_vx(i) += dtforce * atom_fx(i);
+    atom_vy(i) += dtforce * atom_fy(i);
+    atom_vz(i) += dtforce * atom_fz(i);
+}
+
+extern "C" {
+
+void finalIntegrate_cuda(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) {
+    const int Nlocal = atom->Nlocal;
+    const int num_threads_per_block = get_num_threads();
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom);
+
+    checkCUDAError( "PeekAtLastError FinalIntegrate", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync FinalIntegrate", cudaDeviceSynchronize() );
+
+    if(doReneighbour) {
+        checkCUDAError( "FinalIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    }
+}
+
+void initialIntegrate_cuda(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) {
+    const int Nlocal = atom->Nlocal;
+    const int num_threads_per_block = get_num_threads();
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom);
+
+    checkCUDAError( "PeekAtLastError InitialIntegrate", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync InitialIntegrate", cudaDeviceSynchronize() );
+
+    if(doReneighbour) {
+        checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+    }
+}
+
+double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
+    const int num_threads_per_block = get_num_threads();
+    int Nlocal = atom->Nlocal;
+    MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
+    MD_FLOAT sigma6 = param->sigma6;
+    MD_FLOAT epsilon = param->epsilon;
+
+    /*
+    int nDevices;
+    cudaGetDeviceCount(&nDevices);
+    size_t free, total;
+    for(int i = 0; i < nDevices; ++i) {
+        cudaMemGetInfo( &free, &total );
+        cudaDeviceProp prop;
+        cudaGetDeviceProperties(&prop, i);
+        printf("DEVICE %d/%d NAME: %s\r\n with %ld MB/%ld MB memory used", i + 1, nDevices, prop.name, free / 1024 / 1024, total / 1024 / 1024);
+    }
+    */
+
+    // HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
+    // checkCUDAError( "c_atom->fx memset", cudaMemset(c_atom->fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
+
+    cudaProfilerStart();
+
+    const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
+
+    double S = getTimeStamp();
+    LIKWID_MARKER_START("force");
+
+    calc_force <<< num_blocks, num_threads_per_block >>> (*c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighbor->neighbors, c_neighbor->numneigh);
+
+    checkCUDAError( "PeekAtLastError ComputeForce", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync ComputeForce", cudaDeviceSynchronize() );
+
+    cudaProfilerStop();
+
+    LIKWID_MARKER_STOP("force");
+    double E = getTimeStamp();
+
+    return E-S;
+}
+
+}
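One detail worth calling out: the neighbor list is stored transposed on the GPU. calc_force reads neigh_neighbors[Nlocal * k + i], so for a fixed neighbor slot k, consecutive threads i, i+1, ... touch consecutive addresses and the loads coalesce; the CPU version keeps each atom's neighbors contiguous instead. Side by side (a sketch, not MD-Bench code):

    /* CPU-friendly, row-major: atom i's neighbors are contiguous. */
    int neighbor_cpu(const int *neighbors, int maxneighs, int i, int k) {
        return neighbors[i * maxneighs + k];
    }

    /* GPU-friendly, column-major: at fixed k a warp reads 32 consecutive ints,
     * which the hardware coalesces into few memory transactions. */
    int neighbor_gpu(const int *neighbors, int nlocal, int i, int k) {
        return neighbors[k * nlocal + i];
    }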
diff --git a/lammps/cuda/neighbor.cu b/lammps/cuda/neighbor.cu
new file mode 100644
index 0000000..e1ade7d
--- /dev/null
+++ b/lammps/cuda/neighbor.cu
@@ -0,0 +1,329 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdbool.h>
+#include <cuda.h>
+#include <cuda_profiler_api.h>
+//---
+
+extern "C" {
+
+#include <atom.h>
+#include <cuda_atom.h>
+#include <neighbor.h>
+#include <parameter.h>
+#include <util.h>
+
+}
+
+static MD_FLOAT xprd, yprd, zprd;
+static MD_FLOAT bininvx, bininvy, bininvz;
+static int mbinxlo, mbinylo, mbinzlo;
+static int nbinx, nbiny, nbinz;
+static int mbinx, mbiny, mbinz; // n bins in x, y, z
+static int mbins; // total number of bins
+static int atoms_per_bin; // max atoms per bin
+static MD_FLOAT cutneighsq; // neighbor cutoff squared
+static int nmax;
+static int nstencil; // # of bins in stencil
+static int* stencil; // stencil list of bin offsets
+static int* c_stencil = NULL;
+static int* c_resize_needed = NULL;
+static int* c_new_maxneighs = NULL;
+static Binning c_binning {
+    .bincount = NULL,
+    .bins = NULL,
+    .mbins = 0,
+    .atoms_per_bin = 0
+};
+
+__device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin, Neighbor_params np) {
+    int ix, iy, iz;
+
+    if(xin >= np.xprd) {
+        ix = (int)((xin - np.xprd) * np.bininvx) + np.nbinx - np.mbinxlo;
+    } else if(xin >= 0.0) {
+        ix = (int)(xin * np.bininvx) - np.mbinxlo;
+    } else {
+        ix = (int)(xin * np.bininvx) - np.mbinxlo - 1;
+    }
+
+    if(yin >= np.yprd) {
+        iy = (int)((yin - np.yprd) * np.bininvy) + np.nbiny - np.mbinylo;
+    } else if(yin >= 0.0) {
+        iy = (int)(yin * np.bininvy) - np.mbinylo;
+    } else {
+        iy = (int)(yin * np.bininvy) - np.mbinylo - 1;
+    }
+
+    if(zin >= np.zprd) {
+        iz = (int)((zin - np.zprd) * np.bininvz) + np.nbinz - np.mbinzlo;
+    } else if(zin >= 0.0) {
+        iz = (int)(zin * np.bininvz) - np.mbinzlo;
+    } else {
+        iz = (int)(zin * np.bininvz) - np.mbinzlo - 1;
+    }
+
+    return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1);
+}
+
+/* sorts the contents of a bin to make it comparable to the CPU version */
+/* uses bubble sort since atoms per bin should be relatively small and it can be done in place */
+__global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if(i >= mbins) {
+        return;
+    }
+
+    int atoms_in_bin = bincount[i];
+    int *bin_ptr = &bins[i * atoms_per_bin];
+    int sorted;
+    do {
+        sorted = 1;
+        int tmp;
+        for(int index = 0; index < atoms_in_bin - 1; index++){
+            if (bin_ptr[index] > bin_ptr[index + 1]){
+                tmp = bin_ptr[index];
+                bin_ptr[index] = bin_ptr[index + 1];
+                bin_ptr[index + 1] = tmp;
+                sorted = 0;
+            }
+        }
+    } while (!sorted);
+}
+
+__global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed){
+    Atom* atom = &a;
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    int nall = atom->Nlocal + atom->Nghost;
+    if(i >= nall){
+        return;
+    }
+
+    MD_FLOAT x = atom_x(i);
+    MD_FLOAT y = atom_y(i);
+    MD_FLOAT z = atom_z(i);
+    int ibin = coord2bin_device(x, y, z, np);
+
+    int ac = atomicAdd(&bincount[ibin], 1);
+
+    if(ac < atoms_per_bin){
+        bins[ibin * atoms_per_bin + ac] = i;
+    } else {
+        atomicMax(resize_needed, ac);
+    }
+}
+
+__global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
+                                     int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int Nlocal = a.Nlocal;
+    if( i >= Nlocal ) {
+        return;
+    }
+
+    Atom *atom = &a;
+    Neighbor *neighbor = &neigh;
+
+    int* neighptr = &(neighbor->neighbors[i]);
+    int n = 0;
+    MD_FLOAT xtmp = atom_x(i);
+    MD_FLOAT ytmp = atom_y(i);
+    MD_FLOAT ztmp = atom_z(i);
+    int ibin = coord2bin_device(xtmp, ytmp, ztmp, np);
+#ifdef EXPLICIT_TYPES
+    int type_i = atom->type[i];
+#endif
+    for(int k = 0; k < nstencil; k++) {
+        int jbin = ibin + stencil[k];
+        int* loc_bin = &bins[jbin * atoms_per_bin];
+
+        for(int m = 0; m < bincount[jbin]; m++) {
+            int j = loc_bin[m];
+
+            if ( j == i ){
+                continue;
+            }
+
+            MD_FLOAT delx = xtmp - atom_x(j);
+            MD_FLOAT dely = ytmp - atom_y(j);
+            MD_FLOAT delz = ztmp - atom_z(j);
+            MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+#ifdef EXPLICIT_TYPES
+            int type_j = atom->type[j];
+            const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
+#else
+            const MD_FLOAT cutoff = cutneighsq;
+#endif
+
+            if( rsq <= cutoff ) {
+                int idx = atom->Nlocal * n;
+                neighptr[idx] = j;
+                n += 1;
+            }
+        }
+    }
+
+    neighbor->numneigh[i] = n;
+
+    if(n > neighbor->maxneighs) {
+        atomicMax(new_maxneighs, n);
+    }
+}
+
+void binatoms_cuda(Atom *c_atom, Binning *c_binning, int *c_resize_needed, Neighbor_params *np, const int threads_per_block) {
+    int nall = c_atom->Nlocal + c_atom->Nghost;
+    int resize = 1;
+    const int num_blocks = ceil((float) nall / (float) threads_per_block);
+
+    while(resize > 0) {
+        resize = 0;
+        checkCUDAError("binatoms_cuda c_binning->bincount memset", cudaMemset(c_binning->bincount, 0, c_binning->mbins * sizeof(int)));
+        checkCUDAError("binatoms_cuda c_resize_needed memset", cudaMemset(c_resize_needed, 0, sizeof(int)) );
+
+        /* binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed) */
+        binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
+
+        checkCUDAError( "PeekAtLastError binatoms kernel", cudaPeekAtLastError() );
+        checkCUDAError( "DeviceSync binatoms kernel", cudaDeviceSynchronize() );
+
+        checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) );
+
+        if(resize) {
+            cudaFree(c_binning->bins);
+            c_binning->atoms_per_bin *= 2;
+            checkCUDAError("binatoms_cuda c_binning->bins resize malloc", cudaMalloc(&c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) );
+        }
+    }
+
+    atoms_per_bin = c_binning->atoms_per_bin;
+    const int sortBlocks = ceil((float)mbins / (float)threads_per_block);
+    /* sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin) */
+    sort_bin_contents_kernel<<<sortBlocks, threads_per_block>>>(c_binning->bincount, c_binning->bins, c_binning->mbins, c_binning->atoms_per_bin);
+    checkCUDAError( "PeekAtLastError sort_bin_contents kernel", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync sort_bin_contents kernel", cudaDeviceSynchronize() );
+}
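binatoms_kernel never writes past the end of a full bin: each thread claims a slot with atomicAdd, and on overflow only records the largest seen count via atomicMax so the host can double atoms_per_bin and rerun. The same grow-and-retry handshake appears again below for the neighbor lists. Reduced to its skeleton (the fill_bins kernel is hypothetical, not MD-Bench code):

    __global__ void fill_bins(int *bins, int capacity, int *overflow); /* hypothetical */

    void fill_bins_resizing(int **d_bins, int *capacity, int nbins, int *d_overflow,
                            int blocks, int threads) {
        int resize = 1;
        while (resize) {
            resize = 0;
            cudaMemset(d_overflow, 0, sizeof(int));
            fill_bins<<<blocks, threads>>>(*d_bins, *capacity, d_overflow);
            cudaMemcpy(&resize, d_overflow, sizeof(int), cudaMemcpyDeviceToHost);
            if (resize) {          // some bin overflowed: double the capacity and rerun
                cudaFree(*d_bins);
                *capacity *= 2;
                cudaMalloc((void **)d_bins, (size_t)nbins * (size_t)(*capacity) * sizeof(int));
            }
        }
    }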
+
+void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
+    const int num_threads_per_block = get_num_threads();
+    int nall = atom->Nlocal + atom->Nghost;
+    c_neighbor->maxneighs = neighbor->maxneighs;
+
+    cudaProfilerStart();
+
+    /* upload stencil */
+    // TODO: move all of this initialization into its own method
+    if(c_stencil == NULL){
+        checkCUDAError( "buildNeighbor c_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
+        checkCUDAError( "buildNeighbor c_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice ));
+    }
+
+    if(c_binning.mbins == 0){
+        c_binning.mbins = mbins;
+        c_binning.atoms_per_bin = atoms_per_bin;
+        checkCUDAError( "buildNeighbor c_binning.bincount malloc", cudaMalloc((void**)&(c_binning.bincount), c_binning.mbins * sizeof(int)) );
+        checkCUDAError( "buildNeighbor c_binning.bins malloc", cudaMalloc((void**)&(c_binning.bins), c_binning.mbins * c_binning.atoms_per_bin * sizeof(int)) );
+    }
+
+    Neighbor_params np {
+        .xprd = xprd,
+        .yprd = yprd,
+        .zprd = zprd,
+        .bininvx = bininvx,
+        .bininvy = bininvy,
+        .bininvz = bininvz,
+        .mbinxlo = mbinxlo,
+        .mbinylo = mbinylo,
+        .mbinzlo = mbinzlo,
+        .nbinx = nbinx,
+        .nbiny = nbiny,
+        .nbinz = nbinz,
+        .mbinx = mbinx,
+        .mbiny = mbiny,
+        .mbinz = mbinz
+    };
+
+    if(c_resize_needed == NULL){
+        checkCUDAError("buildNeighbor c_resize_needed malloc", cudaMalloc((void**)&c_resize_needed, sizeof(int)) );
+    }
+
+    /* bin local & ghost atoms */
+    binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
+
+    if(c_new_maxneighs == NULL){
+        checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));
+    }
+
+    int resize = 1;
+
+    /* extend c_neighbor arrays if necessary */
+    if(nall > nmax) {
+        nmax = nall;
+        if(c_neighbor->numneigh) cudaFree(c_neighbor->numneigh);
+        if(c_neighbor->neighbors) cudaFree(c_neighbor->neighbors);
+        checkCUDAError( "buildNeighbor c_numneigh malloc", cudaMalloc((void**)&(c_neighbor->numneigh), nmax * sizeof(int)) );
+        checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) );
+    }
+
+    /* loop over each atom, storing neighbors */
+    while(resize) {
+        resize = 0;
+
+        checkCUDAError("c_new_maxneighs memset", cudaMemset(c_new_maxneighs, 0, sizeof(int) ));
+
+        const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
+        /* compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
+           int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq) */
+        compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_atom, *c_neighbor,
+                                                                    np, nstencil, c_stencil,
+                                                                    c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
+                                                                    c_new_maxneighs,
+                                                                    cutneighsq);
+
+        checkCUDAError( "PeekAtLastError ComputeNeighbor", cudaPeekAtLastError() );
+        checkCUDAError( "DeviceSync ComputeNeighbor", cudaDeviceSynchronize() );
+
+        /* copy c_new_maxneighs back to the host and check whether it grew */
+        int new_maxneighs;
+        checkCUDAError("c_new_maxneighs memcpy back", cudaMemcpy(&new_maxneighs, c_new_maxneighs, sizeof(int), cudaMemcpyDeviceToHost));
+        if (new_maxneighs > c_neighbor->maxneighs){
+            resize = 1;
+        }
+
+        if(resize) {
+            printf("RESIZE %d\n", c_neighbor->maxneighs);
+            c_neighbor->maxneighs = new_maxneighs * 1.2;
+            printf("NEW SIZE %d\n", c_neighbor->maxneighs);
+            cudaFree(c_neighbor->neighbors);
+            checkCUDAError("c_neighbor->neighbors resize malloc", cudaMalloc((void**)(&c_neighbor->neighbors), c_atom->Nmax * c_neighbor->maxneighs * sizeof(int)));
+        }
+    }
+
+    neighbor->maxneighs = c_neighbor->maxneighs;
+
+    cudaProfilerStop();
+}
diff --git a/lammps/cuda/pbc.cu b/lammps/cuda/pbc.cu
new file mode 100644
index 0000000..24b680b
--- /dev/null
+++ b/lammps/cuda/pbc.cu
@@ -0,0 +1,151 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <stdio.h>
+#include <math.h>
+//---
+
+extern "C" {
+
+#include <atom.h>
+#include <cuda_atom.h>
+#include <parameter.h>
+#include <pbc.h>
+#include <util.h>
+
+}
+
+static int NmaxGhost;
+static int *PBCx, *PBCy, *PBCz;
+
+static int c_NmaxGhost = 0;
+static int *c_PBCx = NULL, *c_PBCy = NULL, *c_PBCz = NULL;
+
+__global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    Atom* atom = &a;
+    if( i >= atom->Nlocal ){
+        return;
+    }
+
+    if (atom_x(i) < 0.0) {
+        atom_x(i) += xprd;
+    } else if (atom_x(i) >= xprd) {
+        atom_x(i) -= xprd;
+    }
+
+    if (atom_y(i) < 0.0) {
+        atom_y(i) += yprd;
+    } else if (atom_y(i) >= yprd) {
+        atom_y(i) -= yprd;
+    }
+
+    if (atom_z(i) < 0.0) {
+        atom_z(i) += zprd;
+    } else if (atom_z(i) >= zprd) {
+        atom_z(i) -= zprd;
+    }
+}
+
+__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    const int Nghost = a.Nghost;
+    if( i >= Nghost ) {
+        return;
+    }
+    Atom* atom = &a;
+    int *border_map = atom->border_map;
+    int nlocal = atom->Nlocal;
+
+    atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
+    atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
+    atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
+}
+
+/* update coordinates of ghost atoms */
+/* uses mapping created in setupPbc */
+void updatePbc_cuda(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbor) {
+    const int num_threads_per_block = get_num_threads();
+
+    if (doReneighbor){
+        c_atom->Natoms = atom->Natoms;
+        c_atom->Nlocal = atom->Nlocal;
+        c_atom->Nghost = atom->Nghost;
+        c_atom->ntypes = atom->ntypes;
+
+        if (atom->Nmax > c_atom->Nmax){ // the number of ghost atoms has increased -> more space is needed
+            c_atom->Nmax = atom->Nmax;
+            if(c_atom->x != NULL){ cudaFree(c_atom->x); }
+            if(c_atom->type != NULL){ cudaFree(c_atom->type); }
+            checkCUDAError( "updatePbc c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
+            checkCUDAError( "updatePbc c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
+        }
+        // TODO: if the sort is reactivated, atom->vx needs to be copied to the GPU as well
+        checkCUDAError( "updatePbc c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
+
+        if(c_NmaxGhost < NmaxGhost){
+            c_NmaxGhost = NmaxGhost;
+            if(c_PBCx != NULL){ cudaFree(c_PBCx); }
+            if(c_PBCy != NULL){ cudaFree(c_PBCy); }
+            if(c_PBCz != NULL){ cudaFree(c_PBCz); }
+            if(c_atom->border_map != NULL){ cudaFree(c_atom->border_map); }
+            checkCUDAError( "updatePbc c_PBCx malloc", cudaMalloc((void**)&c_PBCx, NmaxGhost * sizeof(int)) );
+            checkCUDAError( "updatePbc c_PBCy malloc", cudaMalloc((void**)&c_PBCy, NmaxGhost * sizeof(int)) );
+            checkCUDAError( "updatePbc c_PBCz malloc", cudaMalloc((void**)&c_PBCz, NmaxGhost * sizeof(int)) );
+            checkCUDAError( "updatePbc c_atom->border_map malloc", cudaMalloc((void**)&(c_atom->border_map), NmaxGhost * sizeof(int)) );
+        }
+        checkCUDAError( "updatePbc c_PBCx memcpy", cudaMemcpy(c_PBCx, PBCx, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_PBCy memcpy", cudaMemcpy(c_PBCy, PBCy, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_PBCz memcpy", cudaMemcpy(c_PBCz, PBCz, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+        checkCUDAError( "updatePbc c_atom->border_map memcpy", cudaMemcpy(c_atom->border_map, atom->border_map, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
+    }
+
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    const int num_blocks = ceil((float)atom->Nghost / (float)num_threads_per_block);
+
+    /* computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz,
+       MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd) */
+    computePbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
+    checkCUDAError( "PeekAtLastError UpdatePbc", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync UpdatePbc", cudaDeviceSynchronize() );
+}
+
+void updateAtomsPbc_cuda(Atom* atom, Atom *c_atom, Parameter *param){
+    const int num_threads_per_block = get_num_threads();
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
+    /* computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd) */
+    computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, xprd, yprd, zprd);
+
+    checkCUDAError( "PeekAtLastError UpdateAtomsPbc", cudaPeekAtLastError() );
+    checkCUDAError( "DeviceSync UpdateAtomsPbc", cudaDeviceSynchronize() );
+
+    checkCUDAError( "updateAtomsPbc position memcpy back", cudaMemcpy(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
+}
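The division of labor is the same as on the CPU: setupPbc (unchanged, host-side) decides once per reneighboring which local atom each ghost mirrors (border_map) and which periodic image it sits in; the kernels then just replay that mapping every step. For one coordinate the replayed update is nothing more than (sketch in plain C):

    /* One ghost atom's x coordinate from its source atom and a precomputed shift.
     * pbc_x[g] is -1, 0 or +1: which periodic image ghost g lives in. */
    float ghost_x(const float *x, const int *border_map, const int *pbc_x,
                  float xprd, int g) {
        return x[border_map[g]] + pbc_x[g] * xprd;
    }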
diff --git a/lammps/includes/atom.h b/lammps/includes/atom.h
index 8c9a534..4136e24 100644
--- a/lammps/includes/atom.h
+++ b/lammps/includes/atom.h
@@ -25,6 +25,29 @@
 #ifndef __ATOM_H_
 #define __ATOM_H_

+#ifdef CUDA_TARGET
+#   define KERNEL_NAME              "CUDA"
+#   define computeForceLJFullNeigh  computeForceLJFullNeigh_cuda
+#   define initialIntegrate         initialIntegrate_cuda
+#   define finalIntegrate           finalIntegrate_cuda
+#   define buildNeighbor            buildNeighbor_cuda
+#   define updatePbc                updatePbc_cuda
+#   define updateAtomsPbc           updateAtomsPbc_cuda
+#else
+#   ifdef USE_SIMD_KERNEL
+#       define KERNEL_NAME              "SIMD"
+#       define computeForceLJFullNeigh  computeForceLJFullNeigh_simd
+#   else
+#       define KERNEL_NAME              "plain-C"
+#       define computeForceLJFullNeigh  computeForceLJFullNeigh_plain_c
+#   endif
+#   define initialIntegrate         initialIntegrate_cpu
+#   define finalIntegrate           finalIntegrate_cpu
+#   define buildNeighbor            buildNeighbor_cpu
+#   define updatePbc                updatePbc_cpu
+#   define updateAtomsPbc           updateAtomsPbc_cpu
+#endif
+
 typedef struct {
     int Natoms, Nlocal, Nghost, Nmax;
     MD_FLOAT *x, *y, *z;
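This macro block is the entire dispatch mechanism: call sites in main.c keep using the generic names, and the preprocessor rewrites them to the _cuda or _cpu variants at compile time, so there is no runtime branching and no function-pointer indirection. A toy version of the idea:

    #include <stdio.h>

    #define CUDA_TARGET   /* pretend this is the NVCC build */

    #ifdef CUDA_TARGET
    #   define buildNeighbor buildNeighbor_cuda
    #else
    #   define buildNeighbor buildNeighbor_cpu
    #endif

    void buildNeighbor_cuda(void) { puts("GPU path"); }
    void buildNeighbor_cpu(void)  { puts("CPU path"); }

    int main(void) {
        buildNeighbor();   /* resolved by the preprocessor; prints "GPU path" */
        return 0;
    }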
diff --git a/lammps/includes/cuda_atom.h b/lammps/includes/cuda_atom.h
new file mode 100644
index 0000000..232daec
--- /dev/null
+++ b/lammps/includes/cuda_atom.h
@@ -0,0 +1,10 @@
+#include <cuda_runtime.h>
+//---
+#include <atom.h>
+#include <neighbor.h>
+
+#ifndef __CUDA_ATOM_H_
+#define __CUDA_ATOM_H_
+extern void initCuda(Atom*, Neighbor*, Atom*, Neighbor*);
+extern void checkCUDAError(const char *msg, cudaError_t err);
+#endif
diff --git a/lammps/includes/integrate.h b/lammps/includes/integrate.h
new file mode 100644
index 0000000..2802595
--- /dev/null
+++ b/lammps/includes/integrate.h
@@ -0,0 +1,48 @@
+/*
+ * =======================================================================================
+ *
+ *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *   This file is part of MD-Bench.
+ *
+ *   MD-Bench is free software: you can redistribute it and/or modify it
+ *   under the terms of the GNU Lesser General Public License as published
+ *   by the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
+ *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
+ *   details.
+ *
+ *   You should have received a copy of the GNU Lesser General Public License along
+ *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
+ * =======================================================================================
+ */
+#include <atom.h>
+#include <parameter.h>
+
+void initialIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
+    for(int i = 0; i < atom->Nlocal; i++) {
+        atom_vx(i) += param->dtforce * atom_fx(i);
+        atom_vy(i) += param->dtforce * atom_fy(i);
+        atom_vz(i) += param->dtforce * atom_fz(i);
+        atom_x(i) = atom_x(i) + param->dt * atom_vx(i);
+        atom_y(i) = atom_y(i) + param->dt * atom_vy(i);
+        atom_z(i) = atom_z(i) + param->dt * atom_vz(i);
+    }
+}
+
+void finalIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
+    for(int i = 0; i < atom->Nlocal; i++) {
+        atom_vx(i) += param->dtforce * atom_fx(i);
+        atom_vy(i) += param->dtforce * atom_fy(i);
+        atom_vz(i) += param->dtforce * atom_fz(i);
+    }
+}
+
+#ifdef CUDA_TARGET
+void initialIntegrate_cuda(bool, Parameter*, Atom*, Atom*);
+void finalIntegrate_cuda(bool, Parameter*, Atom*, Atom*);
+#endif
diff --git a/lammps/includes/neighbor.h b/lammps/includes/neighbor.h
index 193b848..52de07c 100644
--- a/lammps/includes/neighbor.h
+++ b/lammps/includes/neighbor.h
@@ -34,9 +34,29 @@ typedef struct {
     int* numneigh;
 } Neighbor;

+typedef struct {
+    MD_FLOAT xprd; MD_FLOAT yprd; MD_FLOAT zprd;
+    MD_FLOAT bininvx; MD_FLOAT bininvy; MD_FLOAT bininvz;
+    int mbinxlo; int mbinylo; int mbinzlo;
+    int nbinx; int nbiny; int nbinz;
+    int mbinx; int mbiny; int mbinz;
+} Neighbor_params;
+
+typedef struct {
+    int* bincount;
+    int* bins;
+    int mbins;
+    int atoms_per_bin;
+} Binning;
+
 extern void initNeighbor(Neighbor*, Parameter*);
 extern void setupNeighbor(Parameter*);
 extern void binatoms(Atom*);
-extern void buildNeighbor(Atom*, Neighbor*);
+extern void buildNeighbor_cpu(Atom*, Neighbor*, Atom*, Neighbor*);
 extern void sortAtom(Atom*);
+
+#ifdef CUDA_TARGET
+extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*);
+#endif
+
 #endif
diff --git a/lammps/includes/pbc.h b/lammps/includes/pbc.h
index 727ac80..df6e46d 100644
--- a/lammps/includes/pbc.h
+++ b/lammps/includes/pbc.h
@@ -20,13 +20,21 @@
  *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
  * =======================================================================================
  */
+#include <stdbool.h>
+//---
 #include <atom.h>
 #include <parameter.h>

 #ifndef __PBC_H_
 #define __PBC_H_
 extern void initPbc();
-extern void updatePbc(Atom*, Parameter*);
-extern void updateAtomsPbc(Atom*, Parameter*);
+extern void updatePbc_cpu(Atom*, Atom*, Parameter*, bool);
+extern void updateAtomsPbc_cpu(Atom*, Atom*, Parameter*);
 extern void setupPbc(Atom*, Parameter*);
+
+#ifdef CUDA_TARGET
+extern void updatePbc_cuda(Atom*, Atom*, Parameter*, bool);
+extern void updateAtomsPbc_cuda(Atom*, Atom*, Parameter*);
+#endif
+
 #endif
diff --git a/lammps/includes/util.h b/lammps/includes/util.h
index e317afb..3f188b2 100644
--- a/lammps/includes/util.h
+++ b/lammps/includes/util.h
@@ -50,4 +50,6 @@ extern double myrandom(int*);
 extern void random_reset(int *seed, int ibase, double *coord);
 extern int str2ff(const char *string);
 extern const char* ff2str(int ff);
+extern int get_num_threads();
+
 #endif
diff --git a/lammps/main.c b/lammps/main.c
index c372470..1791ddf 100644
--- a/lammps/main.c
+++ b/lammps/main.c
@@ -42,6 +42,7 @@
 #include <thermo.h>
 #include <timers.h>
 #include <util.h>
+#include <integrate.h>

 #define HLINE "----------------------------------------------------------------------------\n"

@@ -51,15 +52,12 @@ extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*);
 extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
 extern double computeForceDemFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);

-#ifdef USE_SIMD_KERNEL
-#   define KERNEL_NAME              "SIMD"
-#   define computeForceLJFullNeigh  computeForceLJFullNeigh_simd
-#else
-#   define KERNEL_NAME              "plain-C"
-#   define computeForceLJFullNeigh  computeForceLJFullNeigh_plain_c
+#ifdef CUDA_TARGET
+#include <cuda_atom.h>
+extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*, Atom*, Neighbor*);
 #endif

-double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
+double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, Stats *stats) {
     if(param->force_field == FF_EAM) { initEam(eam, param); }
     double S, E;
     param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
@@ -82,45 +80,29 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
     setupThermo(param, atom->Natoms);
     if(param->input_file == NULL) { adjustThermo(param, atom); }
     setupPbc(atom, param);
-    updatePbc(atom, param);
-    buildNeighbor(atom, neighbor);
+    #ifdef CUDA_TARGET
+    initCuda(atom, neighbor, c_atom, c_neighbor);
+    #endif
+    updatePbc(atom, c_atom, param, true);
+    buildNeighbor(atom, neighbor, c_atom, c_neighbor);
     E = getTimeStamp();
     return E-S;
 }

-double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
+double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
     double S, E;
     S = getTimeStamp();
     LIKWID_MARKER_START("reneighbour");
-    updateAtomsPbc(atom, param);
+    updateAtomsPbc(atom, c_atom, param);
     setupPbc(atom, param);
-    updatePbc(atom, param);
+    updatePbc(atom, c_atom, param, true);
     //sortAtom(atom);
-    buildNeighbor(atom, neighbor);
+    buildNeighbor(atom, neighbor, c_atom, c_neighbor);
     LIKWID_MARKER_STOP("reneighbour");
     E = getTimeStamp();
     return E-S;
 }
-void initialIntegrate(Parameter *param, Atom *atom) {
-    for(int i = 0; i < atom->Nlocal; i++) {
-        atom_vx(i) += param->dtforce * atom_fx(i);
-        atom_vy(i) += param->dtforce * atom_fy(i);
-        atom_vz(i) += param->dtforce * atom_fz(i);
-        atom_x(i) = atom_x(i) + param->dt * atom_vx(i);
-        atom_y(i) = atom_y(i) + param->dt * atom_vy(i);
-        atom_z(i) = atom_z(i) + param->dt * atom_vz(i);
-    }
-}
-
-void finalIntegrate(Parameter *param, Atom *atom) {
-    for(int i = 0; i < atom->Nlocal; i++) {
-        atom_vx(i) += param->dtforce * atom_fx(i);
-        atom_vy(i) += param->dtforce * atom_fy(i);
-        atom_vz(i) += param->dtforce * atom_fz(i);
-    }
-}
-
 void printAtomState(Atom *atom) {
     printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
             atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);
     // int nall = atom->Nlocal + atom->Nghost;
@@ -129,7 +111,7 @@ void printAtomState(Atom *atom) {
     //  }
 }

-double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
+double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, Stats *stats) {
     if(param->force_field == FF_EAM) {
         return computeForceEam(eam, param, atom, neighbor, stats);
     } else if(param->force_field == FF_DEM) {
@@ -145,14 +127,18 @@ double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor,
         return computeForceLJHalfNeigh(param, atom, neighbor, stats);
     }

+    #ifdef CUDA_TARGET
+    return computeForceLJFullNeigh(param, atom, neighbor, c_atom, c_neighbor);
+    #else
     return computeForceLJFullNeigh(param, atom, neighbor, stats);
+    #endif
 }

 int main(int argc, char** argv) {
     double timer[NUMTIMER];
     Eam eam;
-    Atom atom;
-    Neighbor neighbor;
+    Atom atom, c_atom;
+    Neighbor neighbor, c_neighbor;
     Stats stats;
     Parameter param;

@@ -240,16 +226,16 @@ int main(int argc, char** argv) {
     }

     param.cutneigh = param.cutforce + param.skin;
-    setup(&param, &eam, &atom, &neighbor, &stats);
+    setup(&param, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats);
     printParameter(&param);
     printf("step\ttemp\t\tpressure\n");
     computeThermo(0, &param, &atom);
-#if defined(MEM_TRACER) || defined(INDEX_TRACER)
+    #if defined(MEM_TRACER) || defined(INDEX_TRACER)
     traceAddresses(&param, &atom, &neighbor, n + 1);
-#endif
+    #endif

-    timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &stats);
+    timer[FORCE] = computeForce(&eam, &param, &atom, &neighbor, &c_atom, &c_neighbor, &stats);
     timer[NEIGH] = 0.0;
     timer[TOTAL] = getTimeStamp();

@@ -258,21 +244,26 @@ int main(int argc, char** argv) {
     }

     for(int n = 0; n < param.ntimes; n++) {
-        initialIntegrate(&param, &atom);
+        bool reneigh = (n + 1) % param.reneigh_every == 0;
+        initialIntegrate(reneigh, &param, &atom, &c_atom);

         if((n + 1) % param.reneigh_every) {
-            updatePbc(&atom, &param);
+            updatePbc(&atom, &c_atom, &param, false);
         } else {
-            timer[NEIGH] += reneighbour(&param, &atom, &neighbor);
+            timer[NEIGH] += reneighbour(&param, &atom, &neighbor, &c_atom, &c_neighbor);
         }

-#if defined(MEM_TRACER) || defined(INDEX_TRACER)
+        #if defined(MEM_TRACER) || defined(INDEX_TRACER)
         traceAddresses(&param, &atom, &neighbor, n + 1);
-#endif
+        #endif

-        timer[FORCE] += computeForce(&eam, &param, &atom, &neighbor, &stats);
-        finalIntegrate(&param, &atom);
+        timer[FORCE] += computeForce(&eam, &param, &atom, &neighbor, &c_atom, &c_neighbor, &stats);
+        finalIntegrate(reneigh, &param, &atom, &c_atom);

         if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
+            #ifdef CUDA_TARGET
+            checkCUDAError("computeThermo atom->x memcpy back", cudaMemcpy(atom.x, c_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3, cudaMemcpyDeviceToHost));
+            #endif
+
             computeThermo(n + 1, &param, &atom);
         }

@@ -287,11 +278,11 @@ int main(int argc, char** argv) {
     printf(HLINE);
     printf("Force field: %s\n", ff2str(param.force_field));
     printf("Data layout for positions: %s\n", POS_DATA_LAYOUT);
-#if PRECISION == 1
+    #if PRECISION == 1
     printf("Using single precision floating point.\n");
-#else
+    #else
     printf("Using double precision floating point.\n");
-#endif
+    #endif
     printf(HLINE);
     printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
     printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
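The new reneigh flag is just the loop's reneighboring schedule hoisted into a boolean, so the integrators know on which steps GPU state must be copied back to the host before the neighbor lists are rebuilt. The schedule in isolation (toy values):

    #include <stdbool.h>
    #include <stdio.h>

    int main(void) {
        const int ntimes = 60, reneigh_every = 20;
        for (int n = 0; n < ntimes; n++) {
            bool reneigh = (n + 1) % reneigh_every == 0;
            if (reneigh) printf("step %d: copy positions/velocities back, reneighbor\n", n);
        }
        return 0;   /* prints steps 19, 39 and 59 */
    }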
diff --git a/lammps/neighbor.c b/lammps/neighbor.c
index 826a00c..cbdb150 100644
--- a/lammps/neighbor.c
+++ b/lammps/neighbor.c
@@ -169,7 +169,7 @@ void setupNeighbor(Parameter* param) {
     bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
 }

-void buildNeighbor(Atom *atom, Neighbor *neighbor) {
+void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
     int nall = atom->Nlocal + atom->Nghost;

     /* extend atom arrays if necessary */
diff --git a/lammps/pbc.c b/lammps/pbc.c
index 934dff9..d1e80bb 100644
--- a/lammps/pbc.c
+++ b/lammps/pbc.c
@@ -20,9 +20,10 @@
  *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
  * =======================================================================================
  */
+#include <stdbool.h>
 #include <stdio.h>
 #include <math.h>
-
+//---
 #include <pbc.h>
 #include <atom.h>
 #include <allocate.h>
@@ -43,7 +44,7 @@ void initPbc(Atom* atom) {

 /* update coordinates of ghost atoms */
 /* uses mapping created in setupPbc */
-void updatePbc(Atom *atom, Parameter *param) {
+void updatePbc_cpu(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbor) {
     int *border_map = atom->border_map;
     int nlocal = atom->Nlocal;
     MD_FLOAT xprd = param->xprd;
@@ -59,7 +60,7 @@ void updatePbc(Atom *atom, Parameter *param) {

 /* relocate atoms that have left domain according
  * to periodic boundary conditions */
-void updateAtomsPbc(Atom *atom, Parameter *param) {
+void updateAtomsPbc_cpu(Atom *atom, Atom *c_atom, Parameter *param) {
     MD_FLOAT xprd = param->xprd;
     MD_FLOAT yprd = param->yprd;
     MD_FLOAT zprd = param->zprd;
diff --git a/lammps/util.c b/lammps/util.c
index a41c5a9..261c172 100644
--- a/lammps/util.c
+++ b/lammps/util.c
@@ -32,8 +32,7 @@
 #define IR 2836
 #define MASK 123459876

-double myrandom(int* seed)
-{
+double myrandom(int* seed) {
     int k= (*seed) / IQ;
     double ans;

@@ -43,8 +42,7 @@ double myrandom(int* seed)
     return ans;
 }

-void random_reset(int *seed, int ibase, double *coord)
-{
+void random_reset(int *seed, int ibase, double *coord) {
     int i;
     char *str = (char *) &ibase;
     int n = sizeof(int);
@@ -80,18 +78,21 @@ void random_reset(int *seed, int ibase, double *coord)
     //save = 0;
 }

-int str2ff(const char *string)
-{
+int str2ff(const char *string) {
     if(strncmp(string, "lj", 2) == 0) return FF_LJ;
     if(strncmp(string, "eam", 3) == 0) return FF_EAM;
     if(strncmp(string, "dem", 3) == 0) return FF_DEM;
     return -1;
 }

-const char* ff2str(int ff)
-{
+const char* ff2str(int ff) {
     if(ff == FF_LJ) { return "lj"; }
     if(ff == FF_EAM) { return "eam"; }
     if(ff == FF_DEM) { return "dem"; }
     return "invalid";
 }
+
+// Threads per block for all CUDA kernel launches; override via the
+// NUM_THREADS environment variable, default is 32.
+int get_num_threads() {
+    const char *num_threads_env = getenv("NUM_THREADS");
+    return (num_threads_env == NULL) ? 32 : atoi(num_threads_env);
+}