Integrate LAMMPS CUDA versions into master branch
Signed-off-by: Rafael Ravedutti <rafaelravedutti@gmail.com>
This commit is contained in:
parent
eb77e1a3bd
commit
c18124b066
17
Makefile
17
Makefile
@ -3,6 +3,7 @@ TARGET = MDBench-$(TAG)-$(OPT_SCHEME)
|
|||||||
BUILD_DIR = ./$(TAG)-$(OPT_SCHEME)
|
BUILD_DIR = ./$(TAG)-$(OPT_SCHEME)
|
||||||
SRC_DIR = ./$(OPT_SCHEME)
|
SRC_DIR = ./$(OPT_SCHEME)
|
||||||
ASM_DIR = ./asm
|
ASM_DIR = ./asm
|
||||||
|
CUDA_DIR = ./$(SRC_DIR)/cuda
|
||||||
MAKE_DIR = ./
|
MAKE_DIR = ./
|
||||||
Q ?= @
|
Q ?= @
|
||||||
|
|
||||||
@ -15,12 +16,12 @@ include $(MAKE_DIR)/include_GROMACS.mk
|
|||||||
INCLUDES += -I./$(SRC_DIR)/includes
|
INCLUDES += -I./$(SRC_DIR)/includes
|
||||||
|
|
||||||
ifeq ($(strip $(DATA_LAYOUT)),AOS)
|
ifeq ($(strip $(DATA_LAYOUT)),AOS)
|
||||||
DEFINES += -DAOS
|
DEFINES += -DAOS
|
||||||
endif
|
endif
|
||||||
ifeq ($(strip $(DATA_TYPE)),SP)
|
ifeq ($(strip $(DATA_TYPE)),SP)
|
||||||
DEFINES += -DPRECISION=1
|
DEFINES += -DPRECISION=1
|
||||||
else
|
else
|
||||||
DEFINES += -DPRECISION=2
|
DEFINES += -DPRECISION=2
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifneq ($(ASM_SYNTAX), ATT)
|
ifneq ($(ASM_SYNTAX), ATT)
|
||||||
@ -79,11 +80,14 @@ ifeq ($(strip $(USE_SIMD_KERNEL)),true)
|
|||||||
DEFINES += -DUSE_SIMD_KERNEL
|
DEFINES += -DUSE_SIMD_KERNEL
|
||||||
endif
|
endif
|
||||||
|
|
||||||
VPATH = $(SRC_DIR) $(ASM_DIR)
|
VPATH = $(SRC_DIR) $(ASM_DIR) $(CUDA_DIR)
|
||||||
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
|
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
|
||||||
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
|
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
|
||||||
OBJ = $(filter-out $(BUILD_DIR)/main% $(OVERWRITE),$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
|
OBJ = $(filter-out $(BUILD_DIR)/main% $(OVERWRITE),$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
|
||||||
OBJ += $(patsubst $(ASM_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*.s))
|
OBJ += $(patsubst $(ASM_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*.s))
|
||||||
|
ifeq ($(strip $(TAG)),NVCC)
|
||||||
|
OBJ += $(patsubst $(CUDA_DIR)/%.cu, $(BUILD_DIR)/%-cuda.o,$(wildcard $(CUDA_DIR)/*.cu))
|
||||||
|
endif
|
||||||
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(OPTIONS) $(INCLUDES)
|
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(OPTIONS) $(INCLUDES)
|
||||||
|
|
||||||
# $(warning $(OBJ))
|
# $(warning $(OBJ))
|
||||||
@ -106,6 +110,11 @@ $(BUILD_DIR)/%.o: %.c
|
|||||||
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
|
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
|
||||||
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
|
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
|
||||||
|
|
||||||
|
$(BUILD_DIR)/%-cuda.o: %.cu
|
||||||
|
$(info ===> COMPILE $@)
|
||||||
|
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
|
||||||
|
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
|
||||||
|
|
||||||
$(BUILD_DIR)/%.s: %.c
|
$(BUILD_DIR)/%.s: %.c
|
||||||
$(info ===> GENERATE ASM $@)
|
$(info ===> GENERATE ASM $@)
|
||||||
$(Q)$(CC) -S $(ASFLAGS) $(CPPFLAGS) $(CFLAGS) $< -o $@
|
$(Q)$(CC) -S $(ASFLAGS) $(CPPFLAGS) $(CFLAGS) $< -o $@
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
# Compiler tag (GCC/CLANG/ICC/ONEAPI)
|
# Compiler tag (GCC/CLANG/ICC/ONEAPI/NVCC)
|
||||||
TAG ?= ICC
|
TAG ?= ICC
|
||||||
# Instruction set (SSE/AVX/AVX2/AVX512)
|
# Instruction set (SSE/AVX/AVX2/AVX512)
|
||||||
ISA ?= AVX512
|
ISA ?= AVX512
|
||||||
|
16
include_NVCC.mk
Normal file
16
include_NVCC.mk
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
CC = nvcc
|
||||||
|
LINKER = $(CC)
|
||||||
|
|
||||||
|
ANSI_CFLAGS = -ansi
|
||||||
|
ANSI_CFLAGS += -std=c99
|
||||||
|
ANSI_CFLAGS += -pedantic
|
||||||
|
ANSI_CFLAGS += -Wextra
|
||||||
|
|
||||||
|
# CFLAGS = -O0 -g -std=c99 -fargument-noalias
|
||||||
|
#CFLAGS = -O3 -g -arch=sm_61 # -fopenmp
|
||||||
|
CFLAGS = -O3 -g # -fopenmp
|
||||||
|
ASFLAGS = -masm=intel
|
||||||
|
LFLAGS =
|
||||||
|
DEFINES = -D_GNU_SOURCE -DCUDA_TARGET -DNO_ZMM_INTRIN #-DLIKWID_PERFMON
|
||||||
|
INCLUDES = $(LIKWID_INC)
|
||||||
|
LIBS = -lm $(LIKWID_LIB) -lcuda -lcudart #-llikwid
|
@ -24,28 +24,24 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
|
#include <util.h>
|
||||||
|
|
||||||
void* allocate (int alignment, size_t bytesize)
|
void *allocate(int alignment, size_t bytesize) {
|
||||||
{
|
|
||||||
int errorCode;
|
int errorCode;
|
||||||
void* ptr;
|
void* ptr;
|
||||||
|
|
||||||
errorCode = posix_memalign(&ptr, alignment, bytesize);
|
errorCode = posix_memalign(&ptr, alignment, bytesize);
|
||||||
|
if(errorCode == EINVAL) {
|
||||||
if (errorCode) {
|
fprintf(stderr, "Error: Alignment parameter is not a power of two\n");
|
||||||
if (errorCode == EINVAL) {
|
exit(EXIT_FAILURE);
|
||||||
fprintf(stderr,
|
|
||||||
"Error: Alignment parameter is not a power of two\n");
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
if (errorCode == ENOMEM) {
|
|
||||||
fprintf(stderr,
|
|
||||||
"Error: Insufficient memory to fulfill the request\n");
|
|
||||||
exit(EXIT_FAILURE);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ptr == NULL) {
|
if(errorCode == ENOMEM) {
|
||||||
|
fprintf(stderr, "Error: Insufficient memory to fulfill the request\n");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(ptr == NULL) {
|
||||||
fprintf(stderr, "Error: posix_memalign failed!\n");
|
fprintf(stderr, "Error: posix_memalign failed!\n");
|
||||||
exit(EXIT_FAILURE);
|
exit(EXIT_FAILURE);
|
||||||
}
|
}
|
||||||
@ -53,13 +49,8 @@ void* allocate (int alignment, size_t bytesize)
|
|||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
void* reallocate (
|
void *reallocate(void* ptr, int alignment, size_t newBytesize, size_t oldBytesize) {
|
||||||
void* ptr,
|
void *newarray = allocate(alignment, newBytesize);
|
||||||
int alignment,
|
|
||||||
size_t newBytesize,
|
|
||||||
size_t oldBytesize)
|
|
||||||
{
|
|
||||||
void* newarray = allocate(alignment, newBytesize);
|
|
||||||
|
|
||||||
if(ptr != NULL) {
|
if(ptr != NULL) {
|
||||||
memcpy(newarray, ptr, oldBytesize);
|
memcpy(newarray, ptr, oldBytesize);
|
||||||
@ -68,3 +59,25 @@ void* reallocate (
|
|||||||
|
|
||||||
return newarray;
|
return newarray;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifndef CUDA_TARGET
|
||||||
|
void *allocate_gpu(int alignment, size_t bytesize) { return NULL; }
|
||||||
|
void *reallocate_gpu(void *ptr, int alignment, size_t newBytesize, size_t oldBytesize) { return NULL; }
|
||||||
|
#else
|
||||||
|
void *allocate_gpu(int alignment, size_t bytesize) {
|
||||||
|
void *ptr;
|
||||||
|
checkCUDAError("allocate_gpu", cudaMallocHost((void **) &ptr, bytesize));
|
||||||
|
return ptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Data is not preserved
|
||||||
|
void *reallocate_gpu(void *ptr, int alignment, size_t newBytesize, size_t oldBytesize) {
|
||||||
|
void *newarray = allocate_gpu(alignment, newBytesize);
|
||||||
|
|
||||||
|
if(ptr != NULL) {
|
||||||
|
cudaFreeHost(ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
return newarray;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
@ -31,6 +31,10 @@
|
|||||||
#include <allocate.h>
|
#include <allocate.h>
|
||||||
#include <util.h>
|
#include <util.h>
|
||||||
|
|
||||||
|
#ifdef CUDA_TARGET
|
||||||
|
#include <cuda_atom.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#define DELTA 20000
|
#define DELTA 20000
|
||||||
|
|
||||||
#ifndef MAXLINE
|
#ifndef MAXLINE
|
||||||
@ -67,6 +71,14 @@ void createAtom(Atom *atom, Parameter *param) {
|
|||||||
atom->Natoms = 4 * param->nx * param->ny * param->nz;
|
atom->Natoms = 4 * param->nx * param->ny * param->nz;
|
||||||
atom->Nlocal = 0;
|
atom->Nlocal = 0;
|
||||||
atom->ntypes = param->ntypes;
|
atom->ntypes = param->ntypes;
|
||||||
|
|
||||||
|
#ifdef CUDA_TARGET
|
||||||
|
checkCUDAError( "atom->epsilon cudaMallocHost", cudaMallocHost((void**)&(atom->epsilon), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
|
checkCUDAError( "atom->sigma6 cudaMallocHost", cudaMallocHost((void**)&(atom->sigma6), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
|
checkCUDAError( "atom->cutforcesq cudaMallocHost", cudaMallocHost((void**)&(atom->cutforcesq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
|
checkCUDAError( "atom->cutneighsq cudaMallocHost", cudaMallocHost((void**)&(atom->cutneighsq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
|
#endif
|
||||||
|
|
||||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
|
76
lammps/cuda/atom.cu
Normal file
76
lammps/cuda/atom.cu
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
/*
|
||||||
|
* =======================================================================================
|
||||||
|
*
|
||||||
|
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||||
|
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
|
||||||
|
*
|
||||||
|
* This file is part of MD-Bench.
|
||||||
|
*
|
||||||
|
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU Lesser General Public License as published
|
||||||
|
* by the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||||
|
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||||
|
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||||
|
* details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License along
|
||||||
|
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
* =======================================================================================
|
||||||
|
*/
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
//---
|
||||||
|
#include <allocate.h>
|
||||||
|
#include <atom.h>
|
||||||
|
#include <cuda_atom.h>
|
||||||
|
#include <neighbor.h>
|
||||||
|
|
||||||
|
void initCuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
|
||||||
|
c_atom->Natoms = atom->Natoms;
|
||||||
|
c_atom->Nlocal = atom->Nlocal;
|
||||||
|
c_atom->Nghost = atom->Nghost;
|
||||||
|
c_atom->Nmax = atom->Nmax;
|
||||||
|
c_atom->ntypes = atom->ntypes;
|
||||||
|
|
||||||
|
c_atom->border_map = NULL;
|
||||||
|
|
||||||
|
const int Nlocal = atom->Nlocal;
|
||||||
|
|
||||||
|
checkCUDAError( "c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
|
||||||
|
checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
|
checkCUDAError( "c_atom->fx malloc", cudaMalloc((void**)&(c_atom->fx), sizeof(MD_FLOAT) * Nlocal * 3) );
|
||||||
|
|
||||||
|
checkCUDAError( "c_atom->vx malloc", cudaMalloc((void**)&(c_atom->vx), sizeof(MD_FLOAT) * Nlocal * 3) );
|
||||||
|
checkCUDAError( "c_atom->vx memcpy", cudaMemcpy(c_atom->vx, atom->vx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
|
checkCUDAError( "c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
|
||||||
|
checkCUDAError( "c_atom->epsilon malloc", cudaMalloc((void**)&(c_atom->epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
|
||||||
|
checkCUDAError( "c_atom->sigma6 malloc", cudaMalloc((void**)&(c_atom->sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
|
||||||
|
checkCUDAError( "c_atom->cutforcesq malloc", cudaMalloc((void**)&(c_atom->cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
|
||||||
|
|
||||||
|
checkCUDAError( "c_neighbor->neighbors malloc", cudaMalloc((void**)&c_neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs) );
|
||||||
|
checkCUDAError( "c_neighbor->numneigh malloc", cudaMalloc((void**)&c_neighbor->numneigh, sizeof(int) * Nlocal) );
|
||||||
|
|
||||||
|
checkCUDAError( "c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
|
||||||
|
checkCUDAError( "c_atom->sigma6 memcpy", cudaMemcpy(c_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
|
||||||
|
checkCUDAError( "c_atom->epsilon memcpy", cudaMemcpy(c_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
|
checkCUDAError( "c_atom->cutforcesq memcpy", cudaMemcpy(c_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
|
||||||
|
}
|
||||||
|
|
||||||
|
void checkCUDAError(const char *msg, cudaError_t err) {
|
||||||
|
if (err != cudaSuccess) {
|
||||||
|
//print a human readable error message
|
||||||
|
printf("[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err));
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
202
lammps/cuda/force.cu
Normal file
202
lammps/cuda/force.cu
Normal file
@ -0,0 +1,202 @@
|
|||||||
|
/*
|
||||||
|
* =======================================================================================
|
||||||
|
*
|
||||||
|
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||||
|
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
|
||||||
|
*
|
||||||
|
* This file is part of MD-Bench.
|
||||||
|
*
|
||||||
|
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU Lesser General Public License as published
|
||||||
|
* by the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||||
|
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||||
|
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||||
|
* details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License along
|
||||||
|
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
* =======================================================================================
|
||||||
|
*/
|
||||||
|
#include <math.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
//---
|
||||||
|
#include <cuda_profiler_api.h>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <device_launch_parameters.h>
|
||||||
|
//---
|
||||||
|
#include <likwid-marker.h>
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
|
||||||
|
#include <atom.h>
|
||||||
|
#include <cuda_atom.h>
|
||||||
|
#include <allocate.h>
|
||||||
|
#include <neighbor.h>
|
||||||
|
#include <parameter.h>
|
||||||
|
#include <timing.h>
|
||||||
|
#include <util.h>
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// cuda kernel
|
||||||
|
__global__ void calc_force(Atom a, MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon, int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
|
||||||
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if(i >= Nlocal) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
Atom *atom = &a;
|
||||||
|
|
||||||
|
const int numneighs = neigh_numneigh[i];
|
||||||
|
|
||||||
|
MD_FLOAT xtmp = atom_x(i);
|
||||||
|
MD_FLOAT ytmp = atom_y(i);
|
||||||
|
MD_FLOAT ztmp = atom_z(i);
|
||||||
|
|
||||||
|
MD_FLOAT fix = 0;
|
||||||
|
MD_FLOAT fiy = 0;
|
||||||
|
MD_FLOAT fiz = 0;
|
||||||
|
|
||||||
|
for(int k = 0; k < numneighs; k++) {
|
||||||
|
int j = neigh_neighbors[atom->Nlocal * k + i];
|
||||||
|
MD_FLOAT delx = xtmp - atom_x(j);
|
||||||
|
MD_FLOAT dely = ytmp - atom_y(j);
|
||||||
|
MD_FLOAT delz = ztmp - atom_z(j);
|
||||||
|
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||||
|
|
||||||
|
#ifdef EXPLICIT_TYPES
|
||||||
|
const int type_j = atom->type[j];
|
||||||
|
const int type_ij = type_i * atom->ntypes + type_j;
|
||||||
|
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||||
|
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
|
||||||
|
const MD_FLOAT epsilon = atom->epsilon[type_ij];
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if(rsq < cutforcesq) {
|
||||||
|
MD_FLOAT sr2 = 1.0 / rsq;
|
||||||
|
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||||
|
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
||||||
|
fix += delx * force;
|
||||||
|
fiy += dely * force;
|
||||||
|
fiz += delz * force;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
atom_fx(i) = fix;
|
||||||
|
atom_fy(i) = fiy;
|
||||||
|
atom_fz(i) = fiz;
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
|
||||||
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if( i >= Nlocal ) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
Atom *atom = &a;
|
||||||
|
|
||||||
|
atom_vx(i) += dtforce * atom_fx(i);
|
||||||
|
atom_vy(i) += dtforce * atom_fy(i);
|
||||||
|
atom_vz(i) += dtforce * atom_fz(i);
|
||||||
|
atom_x(i) = atom_x(i) + dt * atom_vx(i);
|
||||||
|
atom_y(i) = atom_y(i) + dt * atom_vy(i);
|
||||||
|
atom_z(i) = atom_z(i) + dt * atom_vz(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
|
||||||
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if( i >= Nlocal ) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
Atom *atom = &a;
|
||||||
|
|
||||||
|
atom_vx(i) += dtforce * atom_fx(i);
|
||||||
|
atom_vy(i) += dtforce * atom_fy(i);
|
||||||
|
atom_vz(i) += dtforce * atom_fz(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
|
||||||
|
void finalIntegrate_cuda(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) {
|
||||||
|
const int Nlocal = atom->Nlocal;
|
||||||
|
const int num_threads_per_block = get_num_threads();
|
||||||
|
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||||
|
|
||||||
|
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom);
|
||||||
|
|
||||||
|
checkCUDAError( "PeekAtLastError FinalIntegrate", cudaPeekAtLastError() );
|
||||||
|
checkCUDAError( "DeviceSync FinalIntegrate", cudaDeviceSynchronize() );
|
||||||
|
|
||||||
|
if(doReneighbour) {
|
||||||
|
checkCUDAError( "FinalIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void initialIntegrate_cuda(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom) {
|
||||||
|
const int Nlocal = atom->Nlocal;
|
||||||
|
const int num_threads_per_block = get_num_threads();
|
||||||
|
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||||
|
|
||||||
|
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom);
|
||||||
|
|
||||||
|
checkCUDAError( "PeekAtLastError InitialIntegrate", cudaPeekAtLastError() );
|
||||||
|
checkCUDAError( "DeviceSync InitialIntegrate", cudaDeviceSynchronize() );
|
||||||
|
|
||||||
|
if(doReneighbour) {
|
||||||
|
checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
double computeForceLJFullNeigh_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
|
||||||
|
const int num_threads_per_block = get_num_threads();
|
||||||
|
int Nlocal = atom->Nlocal;
|
||||||
|
#ifndef EXPLICIT_TYPES
|
||||||
|
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||||
|
MD_FLOAT sigma6 = param->sigma6;
|
||||||
|
MD_FLOAT epsilon = param->epsilon;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
int nDevices;
|
||||||
|
cudaGetDeviceCount(&nDevices);
|
||||||
|
size_t free, total;
|
||||||
|
for(int i = 0; i < nDevices; ++i) {
|
||||||
|
cudaMemGetInfo( &free, &total );
|
||||||
|
cudaDeviceProp prop;
|
||||||
|
cudaGetDeviceProperties(&prop, i);
|
||||||
|
printf("DEVICE %d/%d NAME: %s\r\n with %ld MB/%ld MB memory used", i + 1, nDevices, prop.name, free / 1024 / 1024, total / 1024 / 1024);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
// HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
|
||||||
|
|
||||||
|
// checkCUDAError( "c_atom->fx memset", cudaMemset(c_atom->fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
|
||||||
|
|
||||||
|
cudaProfilerStart();
|
||||||
|
|
||||||
|
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||||
|
|
||||||
|
double S = getTimeStamp();
|
||||||
|
LIKWID_MARKER_START("force");
|
||||||
|
|
||||||
|
calc_force <<< num_blocks, num_threads_per_block >>> (*c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighbor->neighbors, c_neighbor->numneigh);
|
||||||
|
|
||||||
|
checkCUDAError( "PeekAtLastError ComputeForce", cudaPeekAtLastError() );
|
||||||
|
checkCUDAError( "DeviceSync ComputeForce", cudaDeviceSynchronize() );
|
||||||
|
|
||||||
|
cudaProfilerStop();
|
||||||
|
|
||||||
|
LIKWID_MARKER_STOP("force");
|
||||||
|
double E = getTimeStamp();
|
||||||
|
|
||||||
|
return E-S;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
329
lammps/cuda/neighbor.cu
Normal file
329
lammps/cuda/neighbor.cu
Normal file
@ -0,0 +1,329 @@
|
|||||||
|
/*
|
||||||
|
* =======================================================================================
|
||||||
|
*
|
||||||
|
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||||
|
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
|
||||||
|
*
|
||||||
|
* This file is part of MD-Bench.
|
||||||
|
*
|
||||||
|
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU Lesser General Public License as published
|
||||||
|
* by the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||||
|
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||||
|
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||||
|
* details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License along
|
||||||
|
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
* =======================================================================================
|
||||||
|
*/
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <cuda_profiler_api.h>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <device_launch_parameters.h>
|
||||||
|
//---
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
|
||||||
|
#include <atom.h>
|
||||||
|
#include <cuda_atom.h>
|
||||||
|
#include <parameter.h>
|
||||||
|
#include <neighbor.h>
|
||||||
|
#include <util.h>
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
static MD_FLOAT xprd, yprd, zprd;
|
||||||
|
static MD_FLOAT bininvx, bininvy, bininvz;
|
||||||
|
static int mbinxlo, mbinylo, mbinzlo;
|
||||||
|
static int nbinx, nbiny, nbinz;
|
||||||
|
static int mbinx, mbiny, mbinz; // n bins in x, y, z
|
||||||
|
static int mbins; //total number of bins
|
||||||
|
static int atoms_per_bin; // max atoms per bin
|
||||||
|
static MD_FLOAT cutneighsq; // neighbor cutoff squared
|
||||||
|
static int nmax;
|
||||||
|
static int nstencil; // # of bins in stencil
|
||||||
|
static int* stencil; // stencil list of bin offsets
|
||||||
|
static int* c_stencil = NULL;
|
||||||
|
static int* c_resize_needed = NULL;
|
||||||
|
static int* c_new_maxneighs = NULL;
|
||||||
|
static Binning c_binning {
|
||||||
|
.bincount = NULL,
|
||||||
|
.bins = NULL,
|
||||||
|
.mbins = 0,
|
||||||
|
.atoms_per_bin = 0
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
__device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin, Neighbor_params np) {
|
||||||
|
int ix, iy, iz;
|
||||||
|
|
||||||
|
if(xin >= np.xprd) {
|
||||||
|
ix = (int)((xin - np.xprd) * np.bininvx) + np.nbinx - np.mbinxlo;
|
||||||
|
} else if(xin >= 0.0) {
|
||||||
|
ix = (int)(xin * np.bininvx) - np.mbinxlo;
|
||||||
|
} else {
|
||||||
|
ix = (int)(xin * np.bininvx) - np.mbinxlo - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(yin >= np.yprd) {
|
||||||
|
iy = (int)((yin - np.yprd) * np.bininvy) + np.nbiny - np.mbinylo;
|
||||||
|
} else if(yin >= 0.0) {
|
||||||
|
iy = (int)(yin * np.bininvy) - np.mbinylo;
|
||||||
|
} else {
|
||||||
|
iy = (int)(yin * np.bininvy) - np.mbinylo - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(zin >= np.zprd) {
|
||||||
|
iz = (int)((zin - np.zprd) * np.bininvz) + np.nbinz - np.mbinzlo;
|
||||||
|
} else if(zin >= 0.0) {
|
||||||
|
iz = (int)(zin * np.bininvz) - np.mbinzlo;
|
||||||
|
} else {
|
||||||
|
iz = (int)(zin * np.bininvz) - np.mbinzlo - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* sorts the contents of a bin to make it comparable to the CPU version */
|
||||||
|
/* uses bubble sort since atoms per bin should be relatively small and can be done in situ */
|
||||||
|
__global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin){
|
||||||
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if(i >= mbins) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
int atoms_in_bin = bincount[i];
|
||||||
|
int *bin_ptr = &bins[i * atoms_per_bin];
|
||||||
|
int sorted;
|
||||||
|
do {
|
||||||
|
sorted = 1;
|
||||||
|
int tmp;
|
||||||
|
for(int index = 0; index < atoms_in_bin - 1; index++){
|
||||||
|
if (bin_ptr[index] > bin_ptr[index + 1]){
|
||||||
|
tmp = bin_ptr[index];
|
||||||
|
bin_ptr[index] = bin_ptr[index + 1];
|
||||||
|
bin_ptr[index + 1] = tmp;
|
||||||
|
sorted = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while (!sorted);
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed){
|
||||||
|
Atom* atom = &a;
|
||||||
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
int nall = atom->Nlocal + atom->Nghost;
|
||||||
|
if(i >= nall){
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
MD_FLOAT x = atom_x(i);
|
||||||
|
MD_FLOAT y = atom_y(i);
|
||||||
|
MD_FLOAT z = atom_z(i);
|
||||||
|
int ibin = coord2bin_device(x, y, z, np);
|
||||||
|
|
||||||
|
int ac = atomicAdd(&bincount[ibin], 1);
|
||||||
|
|
||||||
|
if(ac < atoms_per_bin){
|
||||||
|
bins[ibin * atoms_per_bin + ac] = i;
|
||||||
|
} else {
|
||||||
|
atomicMax(resize_needed, ac);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
|
||||||
|
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq){
|
||||||
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
const int Nlocal = a.Nlocal;
|
||||||
|
if( i >= Nlocal ) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
Atom *atom = &a;
|
||||||
|
Neighbor *neighbor = &neigh;
|
||||||
|
|
||||||
|
int* neighptr = &(neighbor->neighbors[i]);
|
||||||
|
int n = 0;
|
||||||
|
MD_FLOAT xtmp = atom_x(i);
|
||||||
|
MD_FLOAT ytmp = atom_y(i);
|
||||||
|
MD_FLOAT ztmp = atom_z(i);
|
||||||
|
int ibin = coord2bin_device(xtmp, ytmp, ztmp, np);
|
||||||
|
#ifdef EXPLICIT_TYPES
|
||||||
|
int type_i = atom->type[i];
|
||||||
|
#endif
|
||||||
|
for(int k = 0; k < nstencil; k++) {
|
||||||
|
int jbin = ibin + stencil[k];
|
||||||
|
int* loc_bin = &bins[jbin * atoms_per_bin];
|
||||||
|
|
||||||
|
for(int m = 0; m < bincount[jbin]; m++) {
|
||||||
|
int j = loc_bin[m];
|
||||||
|
|
||||||
|
if ( j == i ){
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
MD_FLOAT delx = xtmp - atom_x(j);
|
||||||
|
MD_FLOAT dely = ytmp - atom_y(j);
|
||||||
|
MD_FLOAT delz = ztmp - atom_z(j);
|
||||||
|
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||||
|
|
||||||
|
#ifdef EXPLICIT_TYPES
|
||||||
|
int type_j = atom->type[j];
|
||||||
|
const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
|
||||||
|
#else
|
||||||
|
const MD_FLOAT cutoff = cutneighsq;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if( rsq <= cutoff ) {
|
||||||
|
int idx = atom->Nlocal * n;
|
||||||
|
neighptr[idx] = j;
|
||||||
|
n += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
neighbor->numneigh[i] = n;
|
||||||
|
|
||||||
|
if(n > neighbor->maxneighs) {
|
||||||
|
atomicMax(new_maxneighs, n);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void binatoms_cuda(Atom *c_atom, Binning *c_binning, int *c_resize_needed, Neighbor_params *np, const int threads_per_block) {
|
||||||
|
int nall = c_atom->Nlocal + c_atom->Nghost;
|
||||||
|
int resize = 1;
|
||||||
|
const int num_blocks = ceil((float) nall / (float) threads_per_block);
|
||||||
|
|
||||||
|
while(resize > 0) {
|
||||||
|
resize = 0;
|
||||||
|
checkCUDAError("binatoms_cuda c_binning->bincount memset", cudaMemset(c_binning->bincount, 0, c_binning->mbins * sizeof(int)));
|
||||||
|
checkCUDAError("binatoms_cuda c_resize_needed memset", cudaMemset(c_resize_needed, 0, sizeof(int)) );
|
||||||
|
|
||||||
|
/*binatoms_kernel(Atom a, int* bincount, int* bins, int c_binning->atoms_per_bin, Neighbor_params np, int *resize_needed) */
|
||||||
|
binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
|
||||||
|
|
||||||
|
checkCUDAError( "PeekAtLastError binatoms kernel", cudaPeekAtLastError() );
|
||||||
|
checkCUDAError( "DeviceSync binatoms kernel", cudaDeviceSynchronize() );
|
||||||
|
|
||||||
|
checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) );
|
||||||
|
|
||||||
|
if(resize) {
|
||||||
|
cudaFree(c_binning->bins);
|
||||||
|
c_binning->atoms_per_bin *= 2;
|
||||||
|
checkCUDAError("binatoms_cuda c_binning->bins resize malloc", cudaMalloc(&c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) );
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
atoms_per_bin = c_binning->atoms_per_bin;
|
||||||
|
const int sortBlocks = ceil((float)mbins / (float)threads_per_block);
|
||||||
|
/*void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin)*/
|
||||||
|
sort_bin_contents_kernel<<<sortBlocks, threads_per_block>>>(c_binning->bincount, c_binning->bins, c_binning->mbins, c_binning->atoms_per_bin);
|
||||||
|
checkCUDAError( "PeekAtLastError sort_bin_contents kernel", cudaPeekAtLastError() );
|
||||||
|
checkCUDAError( "DeviceSync sort_bin_contents kernel", cudaDeviceSynchronize() );
|
||||||
|
}
|
||||||
|
|
||||||
|
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
|
||||||
|
const int num_threads_per_block = get_num_threads();
|
||||||
|
int nall = atom->Nlocal + atom->Nghost;
|
||||||
|
c_neighbor->maxneighs = neighbor->maxneighs;
|
||||||
|
|
||||||
|
cudaProfilerStart();
|
||||||
|
/* upload stencil */
|
||||||
|
// TODO move all of this initialization into its own method
|
||||||
|
if(c_stencil == NULL){
|
||||||
|
checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
|
||||||
|
checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice ));
|
||||||
|
}
|
||||||
|
|
||||||
|
if(c_binning.mbins == 0){
|
||||||
|
c_binning.mbins = mbins;
|
||||||
|
c_binning.atoms_per_bin = atoms_per_bin;
|
||||||
|
checkCUDAError( "buildNeighbor c_binning->bincount malloc", cudaMalloc((void**)&(c_binning.bincount), c_binning.mbins * sizeof(int)) );
|
||||||
|
checkCUDAError( "buidlNeighbor c_binning->bins malloc", cudaMalloc((void**)&(c_binning.bins), c_binning.mbins * c_binning.atoms_per_bin * sizeof(int)) );
|
||||||
|
}
|
||||||
|
|
||||||
|
Neighbor_params np {
|
||||||
|
.xprd = xprd,
|
||||||
|
.yprd = yprd,
|
||||||
|
.zprd = zprd,
|
||||||
|
.bininvx = bininvx,
|
||||||
|
.bininvy = bininvy,
|
||||||
|
.bininvz = bininvz,
|
||||||
|
.mbinxlo = mbinxlo,
|
||||||
|
.mbinylo = mbinylo,
|
||||||
|
.mbinzlo = mbinzlo,
|
||||||
|
.nbinx = nbinx,
|
||||||
|
.nbiny = nbiny,
|
||||||
|
.nbinz = nbinz,
|
||||||
|
.mbinx = mbinx,
|
||||||
|
.mbiny = mbiny,
|
||||||
|
.mbinz = mbinz
|
||||||
|
};
|
||||||
|
|
||||||
|
if(c_resize_needed == NULL){
|
||||||
|
checkCUDAError("buildNeighbor c_resize_needed malloc", cudaMalloc((void**)&c_resize_needed, sizeof(int)) );
|
||||||
|
}
|
||||||
|
|
||||||
|
/* bin local & ghost atoms */
|
||||||
|
binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
|
||||||
|
if(c_new_maxneighs == NULL){
|
||||||
|
checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));
|
||||||
|
}
|
||||||
|
|
||||||
|
int resize = 1;
|
||||||
|
|
||||||
|
/* extend c_neighbor arrays if necessary */
|
||||||
|
if(nall > nmax) {
|
||||||
|
nmax = nall;
|
||||||
|
if(c_neighbor->numneigh) cudaFree(c_neighbor->numneigh);
|
||||||
|
if(c_neighbor->neighbors) cudaFree(c_neighbor->neighbors);
|
||||||
|
checkCUDAError( "buildNeighbor c_numneigh malloc", cudaMalloc((void**)&(c_neighbor->numneigh), nmax * sizeof(int)) );
|
||||||
|
checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) );
|
||||||
|
}
|
||||||
|
|
||||||
|
/* loop over each atom, storing neighbors */
|
||||||
|
while(resize) {
|
||||||
|
resize = 0;
|
||||||
|
|
||||||
|
checkCUDAError("c_new_maxneighs memset", cudaMemset(c_new_maxneighs, 0, sizeof(int) ));
|
||||||
|
|
||||||
|
// TODO call compute_neigborhood kernel here
|
||||||
|
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
|
||||||
|
/*compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
|
||||||
|
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs)
|
||||||
|
* */
|
||||||
|
compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_atom, *c_neighbor,
|
||||||
|
np, nstencil, c_stencil,
|
||||||
|
c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
|
||||||
|
c_new_maxneighs,
|
||||||
|
cutneighsq);
|
||||||
|
|
||||||
|
checkCUDAError( "PeekAtLastError ComputeNeighbor", cudaPeekAtLastError() );
|
||||||
|
checkCUDAError( "DeviceSync ComputeNeighbor", cudaDeviceSynchronize() );
|
||||||
|
|
||||||
|
// TODO copy the value of c_new_maxneighs back to host and check if it has been modified
|
||||||
|
int new_maxneighs;
|
||||||
|
checkCUDAError("c_new_maxneighs memcpy back", cudaMemcpy(&new_maxneighs, c_new_maxneighs, sizeof(int), cudaMemcpyDeviceToHost));
|
||||||
|
if (new_maxneighs > c_neighbor->maxneighs){
|
||||||
|
resize = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(resize) {
|
||||||
|
printf("RESIZE %d\n", c_neighbor->maxneighs);
|
||||||
|
c_neighbor->maxneighs = new_maxneighs * 1.2;
|
||||||
|
printf("NEW SIZE %d\n", c_neighbor->maxneighs);
|
||||||
|
cudaFree(c_neighbor->neighbors);
|
||||||
|
checkCUDAError("c_neighbor->neighbors resize malloc", cudaMalloc((void**)(&c_neighbor->neighbors), c_atom->Nmax * c_neighbor->maxneighs * sizeof(int)));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
neighbor->maxneighs = c_neighbor->maxneighs;
|
||||||
|
|
||||||
|
cudaProfilerStop();
|
||||||
|
}
|
151
lammps/cuda/pbc.cu
Normal file
151
lammps/cuda/pbc.cu
Normal file
@ -0,0 +1,151 @@
|
|||||||
|
/*
|
||||||
|
* =======================================================================================
|
||||||
|
*
|
||||||
|
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||||
|
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
|
||||||
|
*
|
||||||
|
* This file is part of MD-Bench.
|
||||||
|
*
|
||||||
|
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU Lesser General Public License as published
|
||||||
|
* by the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||||
|
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||||
|
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||||
|
* details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License along
|
||||||
|
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
* =======================================================================================
|
||||||
|
*/
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
//---
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
|
||||||
|
#include <allocate.h>
|
||||||
|
#include <atom.h>
|
||||||
|
#include <cuda_atom.h>
|
||||||
|
#include <pbc.h>
|
||||||
|
#include <util.h>
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
static int NmaxGhost;
|
||||||
|
static int *PBCx, *PBCy, *PBCz;
|
||||||
|
static int c_NmaxGhost = 0;
|
||||||
|
static int *c_PBCx = NULL, *c_PBCy = NULL, *c_PBCz = NULL;
|
||||||
|
|
||||||
|
|
||||||
|
__global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
|
||||||
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
Atom* atom = &a;
|
||||||
|
if( i >= atom->Nlocal ){
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (atom_x(i) < 0.0) {
|
||||||
|
atom_x(i) += xprd;
|
||||||
|
} else if (atom_x(i) >= xprd) {
|
||||||
|
atom_x(i) -= xprd;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (atom_y(i) < 0.0) {
|
||||||
|
atom_y(i) += yprd;
|
||||||
|
} else if (atom_y(i) >= yprd) {
|
||||||
|
atom_y(i) -= yprd;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (atom_z(i) < 0.0) {
|
||||||
|
atom_z(i) += zprd;
|
||||||
|
} else if (atom_z(i) >= zprd) {
|
||||||
|
atom_z(i) -= zprd;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
|
||||||
|
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
const int Nghost = a.Nghost;
|
||||||
|
if( i >= Nghost ) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
Atom* atom = &a;
|
||||||
|
int *border_map = atom->border_map;
|
||||||
|
int nlocal = atom->Nlocal;
|
||||||
|
|
||||||
|
atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
|
||||||
|
atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
|
||||||
|
atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* update coordinates of ghost atoms */
|
||||||
|
/* uses mapping created in setupPbc */
|
||||||
|
void updatePbc_cuda(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbor) {
|
||||||
|
const int num_threads_per_block = get_num_threads();
|
||||||
|
|
||||||
|
if (doReneighbor){
|
||||||
|
c_atom->Natoms = atom->Natoms;
|
||||||
|
c_atom->Nlocal = atom->Nlocal;
|
||||||
|
c_atom->Nghost = atom->Nghost;
|
||||||
|
c_atom->ntypes = atom->ntypes;
|
||||||
|
|
||||||
|
if (atom->Nmax > c_atom->Nmax){ // the number of ghost atoms has increased -> more space is needed
|
||||||
|
c_atom->Nmax = atom->Nmax;
|
||||||
|
if(c_atom->x != NULL){ cudaFree(c_atom->x); }
|
||||||
|
if(c_atom->type != NULL){ cudaFree(c_atom->type); }
|
||||||
|
checkCUDAError( "updatePbc c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
|
||||||
|
checkCUDAError( "updatePbc c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
|
||||||
|
}
|
||||||
|
// TODO if the sort is reactivated the atom->vx needs to be copied to GPU as well
|
||||||
|
checkCUDAError( "updatePbc c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
|
||||||
|
checkCUDAError( "updatePbc c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
|
if(c_NmaxGhost < NmaxGhost){
|
||||||
|
c_NmaxGhost = NmaxGhost;
|
||||||
|
if(c_PBCx != NULL){ cudaFree(c_PBCx); }
|
||||||
|
if(c_PBCy != NULL){ cudaFree(c_PBCy); }
|
||||||
|
if(c_PBCz != NULL){ cudaFree(c_PBCz); }
|
||||||
|
if(c_atom->border_map != NULL){ cudaFree(c_atom->border_map); }
|
||||||
|
checkCUDAError( "updatePbc c_PBCx malloc", cudaMalloc((void**)&c_PBCx, NmaxGhost * sizeof(int)) );
|
||||||
|
checkCUDAError( "updatePbc c_PBCy malloc", cudaMalloc((void**)&c_PBCy, NmaxGhost * sizeof(int)) );
|
||||||
|
checkCUDAError( "updatePbc c_PBCz malloc", cudaMalloc((void**)&c_PBCz, NmaxGhost * sizeof(int)) );
|
||||||
|
checkCUDAError( "updatePbc c_atom->border_map malloc", cudaMalloc((void**)&(c_atom->border_map), NmaxGhost * sizeof(int)) );
|
||||||
|
}
|
||||||
|
checkCUDAError( "updatePbc c_PBCx memcpy", cudaMemcpy(c_PBCx, PBCx, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
|
||||||
|
checkCUDAError( "updatePbc c_PBCy memcpy", cudaMemcpy(c_PBCy, PBCy, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
|
||||||
|
checkCUDAError( "updatePbc c_PBCz memcpy", cudaMemcpy(c_PBCz, PBCz, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
|
||||||
|
checkCUDAError( "updatePbc c_atom->border_map memcpy", cudaMemcpy(c_atom->border_map, atom->border_map, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
|
||||||
|
}
|
||||||
|
|
||||||
|
MD_FLOAT xprd = param->xprd;
|
||||||
|
MD_FLOAT yprd = param->yprd;
|
||||||
|
MD_FLOAT zprd = param->zprd;
|
||||||
|
|
||||||
|
const int num_blocks = ceil((float)atom->Nghost / (float)num_threads_per_block);
|
||||||
|
|
||||||
|
/*__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz,
|
||||||
|
* MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)
|
||||||
|
* */
|
||||||
|
computePbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
|
||||||
|
checkCUDAError( "PeekAtLastError UpdatePbc", cudaPeekAtLastError() );
|
||||||
|
checkCUDAError( "DeviceSync UpdatePbc", cudaDeviceSynchronize() );
|
||||||
|
}
|
||||||
|
|
||||||
|
void updateAtomsPbc_cuda(Atom* atom, Atom *c_atom, Parameter *param){
|
||||||
|
const int num_threads_per_block = get_num_threads();
|
||||||
|
MD_FLOAT xprd = param->xprd;
|
||||||
|
MD_FLOAT yprd = param->yprd;
|
||||||
|
MD_FLOAT zprd = param->zprd;
|
||||||
|
|
||||||
|
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
|
||||||
|
/*void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)*/
|
||||||
|
computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, xprd, yprd, zprd);
|
||||||
|
|
||||||
|
checkCUDAError( "PeekAtLastError UpdateAtomsPbc", cudaPeekAtLastError() );
|
||||||
|
checkCUDAError( "DeviceSync UpdateAtomsPbc", cudaDeviceSynchronize() );
|
||||||
|
|
||||||
|
checkCUDAError( "updateAtomsPbc position memcpy back", cudaMemcpy(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
|
||||||
|
}
|
@ -25,6 +25,29 @@
|
|||||||
#ifndef __ATOM_H_
|
#ifndef __ATOM_H_
|
||||||
#define __ATOM_H_
|
#define __ATOM_H_
|
||||||
|
|
||||||
|
#ifdef CUDA_TARGET
|
||||||
|
# define KERNEL_NAME "CUDA"
|
||||||
|
# define computeForceLJFullNeigh computeForceLJFullNeigh_cuda
|
||||||
|
# define initialIntegrate initialIntegrate_cuda
|
||||||
|
# define finalIntegrate finalIntegrate_cuda
|
||||||
|
# define buildNeighbor buildNeighbor_cuda
|
||||||
|
# define updatePbc updatePbc_cuda
|
||||||
|
# define updateAtomsPbc updateAtomsPbc_cuda
|
||||||
|
#else
|
||||||
|
# ifdef USE_SIMD_KERNEL
|
||||||
|
# define KERNEL_NAME "SIMD"
|
||||||
|
# define computeForceLJFullNeigh computeForceLJFullNeigh_simd
|
||||||
|
# else
|
||||||
|
# define KERNEL_NAME "plain-C"
|
||||||
|
# define computeForceLJFullNeigh computeForceLJFullNeigh_plain_c
|
||||||
|
# endif
|
||||||
|
# define initialIntegrate initialIntegrate_cpu
|
||||||
|
# define finalIntegrate finalIntegrate_cpu
|
||||||
|
# define buildNeighbor buildNeighbor_cpu
|
||||||
|
# define updatePbc updatePbc_cpu
|
||||||
|
# define updateAtomsPbc updateAtomsPbc_cpu
|
||||||
|
#endif
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int Natoms, Nlocal, Nghost, Nmax;
|
int Natoms, Nlocal, Nghost, Nmax;
|
||||||
MD_FLOAT *x, *y, *z;
|
MD_FLOAT *x, *y, *z;
|
||||||
|
10
lammps/includes/cuda_atom.h
Normal file
10
lammps/includes/cuda_atom.h
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
#include <cuda_runtime.h>
|
||||||
|
//---
|
||||||
|
#include <atom.h>
|
||||||
|
#include <neighbor.h>
|
||||||
|
|
||||||
|
#ifndef __CUDA_ATOM_H_
|
||||||
|
#define __CUDA_ATOM_H_
|
||||||
|
extern void initCuda(Atom*, Neighbor*, Atom*, Neighbor*);
|
||||||
|
extern void checkCUDAError(const char *msg, cudaError_t err);
|
||||||
|
#endif
|
48
lammps/includes/integrate.h
Normal file
48
lammps/includes/integrate.h
Normal file
@ -0,0 +1,48 @@
|
|||||||
|
/*
|
||||||
|
* =======================================================================================
|
||||||
|
*
|
||||||
|
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||||
|
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
|
||||||
|
*
|
||||||
|
* This file is part of MD-Bench.
|
||||||
|
*
|
||||||
|
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU Lesser General Public License as published
|
||||||
|
* by the Free Software Foundation, either version 3 of the License, or
|
||||||
|
* (at your option) any later version.
|
||||||
|
*
|
||||||
|
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||||
|
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||||
|
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||||
|
* details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Lesser General Public License along
|
||||||
|
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
* =======================================================================================
|
||||||
|
*/
|
||||||
|
#include <parameter.h>
|
||||||
|
#include <atom.h>
|
||||||
|
|
||||||
|
void initialIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
|
||||||
|
for(int i = 0; i < atom->Nlocal; i++) {
|
||||||
|
atom_vx(i) += param->dtforce * atom_fx(i);
|
||||||
|
atom_vy(i) += param->dtforce * atom_fy(i);
|
||||||
|
atom_vz(i) += param->dtforce * atom_fz(i);
|
||||||
|
atom_x(i) = atom_x(i) + param->dt * atom_vx(i);
|
||||||
|
atom_y(i) = atom_y(i) + param->dt * atom_vy(i);
|
||||||
|
atom_z(i) = atom_z(i) + param->dt * atom_vz(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void finalIntegrate_cpu(bool reneigh, Parameter *param, Atom *atom, Atom *c_atom) {
|
||||||
|
for(int i = 0; i < atom->Nlocal; i++) {
|
||||||
|
atom_vx(i) += param->dtforce * atom_fx(i);
|
||||||
|
atom_vy(i) += param->dtforce * atom_fy(i);
|
||||||
|
atom_vz(i) += param->dtforce * atom_fz(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef CUDA_TARGET
|
||||||
|
void initialIntegrate_cuda(bool, Parameter*, Atom*, Atom*);
|
||||||
|
void finalIntegrate_cuda(bool, Parameter*, Atom*, Atom*);
|
||||||
|
#endif
|
@ -34,9 +34,29 @@ typedef struct {
|
|||||||
int* numneigh;
|
int* numneigh;
|
||||||
} Neighbor;
|
} Neighbor;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
MD_FLOAT xprd; MD_FLOAT yprd; MD_FLOAT zprd;
|
||||||
|
MD_FLOAT bininvx; MD_FLOAT bininvy; MD_FLOAT bininvz;
|
||||||
|
int mbinxlo; int mbinylo; int mbinzlo;
|
||||||
|
int nbinx; int nbiny; int nbinz;
|
||||||
|
int mbinx; int mbiny; int mbinz;
|
||||||
|
} Neighbor_params;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
int* bincount;
|
||||||
|
int* bins;
|
||||||
|
int mbins;
|
||||||
|
int atoms_per_bin;
|
||||||
|
} Binning;
|
||||||
|
|
||||||
extern void initNeighbor(Neighbor*, Parameter*);
|
extern void initNeighbor(Neighbor*, Parameter*);
|
||||||
extern void setupNeighbor(Parameter*);
|
extern void setupNeighbor(Parameter*);
|
||||||
extern void binatoms(Atom*);
|
extern void binatoms(Atom*);
|
||||||
extern void buildNeighbor(Atom*, Neighbor*);
|
extern void buildNeighbor_cpu(Atom*, Neighbor*, Atom*, Neighbor*);
|
||||||
extern void sortAtom(Atom*);
|
extern void sortAtom(Atom*);
|
||||||
|
|
||||||
|
#ifdef CUDA_TARGET
|
||||||
|
extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*);
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -20,13 +20,21 @@
|
|||||||
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||||
* =======================================================================================
|
* =======================================================================================
|
||||||
*/
|
*/
|
||||||
|
#include <stdbool.h>
|
||||||
|
//---
|
||||||
#include <atom.h>
|
#include <atom.h>
|
||||||
#include <parameter.h>
|
#include <parameter.h>
|
||||||
|
|
||||||
#ifndef __PBC_H_
|
#ifndef __PBC_H_
|
||||||
#define __PBC_H_
|
#define __PBC_H_
|
||||||
extern void initPbc();
|
extern void initPbc();
|
||||||
extern void updatePbc(Atom*, Parameter*);
|
extern void updatePbc_cpu(Atom*, Atom*, Parameter*, bool);
|
||||||
extern void updateAtomsPbc(Atom*, Parameter*);
|
extern void updateAtomsPbc_cpu(Atom*, Atom*, Parameter*);
|
||||||
extern void setupPbc(Atom*, Parameter*);
|
extern void setupPbc(Atom*, Parameter*);
|
||||||
|
|
||||||
|
#ifdef CUDA_TARGET
|
||||||
|
extern void updatePbc_cuda(Atom*, Atom*, Parameter*, bool);
|
||||||
|
extern void updateAtomsPbc_cuda(Atom*, Atom*, Parameter*);
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -50,4 +50,6 @@ extern double myrandom(int*);
|
|||||||
extern void random_reset(int *seed, int ibase, double *coord);
|
extern void random_reset(int *seed, int ibase, double *coord);
|
||||||
extern int str2ff(const char *string);
|
extern int str2ff(const char *string);
|
||||||
extern const char* ff2str(int ff);
|
extern const char* ff2str(int ff);
|
||||||
|
extern int get_num_threads();
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@ -42,6 +42,7 @@
|
|||||||
#include <eam.h>
|
#include <eam.h>
|
||||||
#include <vtk.h>
|
#include <vtk.h>
|
||||||
#include <util.h>
|
#include <util.h>
|
||||||
|
#include <integrate.h>
|
||||||
|
|
||||||
#define HLINE "----------------------------------------------------------------------------\n"
|
#define HLINE "----------------------------------------------------------------------------\n"
|
||||||
|
|
||||||
@ -51,15 +52,12 @@ extern double computeForceLJHalfNeigh(Parameter*, Atom*, Neighbor*, Stats*);
|
|||||||
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
|
extern double computeForceEam(Eam*, Parameter*, Atom*, Neighbor*, Stats*);
|
||||||
extern double computeForceDemFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);
|
extern double computeForceDemFullNeigh(Parameter*, Atom*, Neighbor*, Stats*);
|
||||||
|
|
||||||
#ifdef USE_SIMD_KERNEL
|
#ifdef CUDA_TARGET
|
||||||
# define KERNEL_NAME "SIMD"
|
#include <cuda_atom.h>
|
||||||
# define computeForceLJFullNeigh computeForceLJFullNeigh_simd
|
extern double computeForceLJFullNeigh_cuda(Parameter*, Atom*, Neighbor*, Atom*, Neighbor*);
|
||||||
#else
|
|
||||||
# define KERNEL_NAME "plain-C"
|
|
||||||
# define computeForceLJFullNeigh computeForceLJFullNeigh_plain_c
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, Stats *stats) {
|
||||||
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
||||||
double S, E;
|
double S, E;
|
||||||
param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
|
param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
|
||||||
@ -82,45 +80,29 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
|
|||||||
setupThermo(param, atom->Natoms);
|
setupThermo(param, atom->Natoms);
|
||||||
if(param->input_file == NULL) { adjustThermo(param, atom); }
|
if(param->input_file == NULL) { adjustThermo(param, atom); }
|
||||||
setupPbc(atom, param);
|
setupPbc(atom, param);
|
||||||
updatePbc(atom, param);
|
#ifdef CUDA_TARGET
|
||||||
buildNeighbor(atom, neighbor);
|
initCuda(atom, neighbor, c_atom, c_neighbor);
|
||||||
|
#endif
|
||||||
|
updatePbc(atom, c_atom, param, true);
|
||||||
|
buildNeighbor(atom, neighbor, c_atom, c_neighbor);
|
||||||
E = getTimeStamp();
|
E = getTimeStamp();
|
||||||
return E-S;
|
return E-S;
|
||||||
}
|
}
|
||||||
|
|
||||||
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
|
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
|
||||||
double S, E;
|
double S, E;
|
||||||
S = getTimeStamp();
|
S = getTimeStamp();
|
||||||
LIKWID_MARKER_START("reneighbour");
|
LIKWID_MARKER_START("reneighbour");
|
||||||
updateAtomsPbc(atom, param);
|
updateAtomsPbc(atom, c_atom, param);
|
||||||
setupPbc(atom, param);
|
setupPbc(atom, param);
|
||||||
updatePbc(atom, param);
|
updatePbc(atom, c_atom, param, true);
|
||||||
//sortAtom(atom);
|
//sortAtom(atom);
|
||||||
buildNeighbor(atom, neighbor);
|
buildNeighbor(atom, neighbor, c_atom, c_neighbor);
|
||||||
LIKWID_MARKER_STOP("reneighbour");
|
LIKWID_MARKER_STOP("reneighbour");
|
||||||
E = getTimeStamp();
|
E = getTimeStamp();
|
||||||
return E-S;
|
return E-S;
|
||||||
}
|
}
|
||||||
|
|
||||||
void initialIntegrate(Parameter *param, Atom *atom) {
|
|
||||||
for(int i = 0; i < atom->Nlocal; i++) {
|
|
||||||
atom_vx(i) += param->dtforce * atom_fx(i);
|
|
||||||
atom_vy(i) += param->dtforce * atom_fy(i);
|
|
||||||
atom_vz(i) += param->dtforce * atom_fz(i);
|
|
||||||
atom_x(i) = atom_x(i) + param->dt * atom_vx(i);
|
|
||||||
atom_y(i) = atom_y(i) + param->dt * atom_vy(i);
|
|
||||||
atom_z(i) = atom_z(i) + param->dt * atom_vz(i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void finalIntegrate(Parameter *param, Atom *atom) {
|
|
||||||
for(int i = 0; i < atom->Nlocal; i++) {
|
|
||||||
atom_vx(i) += param->dtforce * atom_fx(i);
|
|
||||||
atom_vy(i) += param->dtforce * atom_fy(i);
|
|
||||||
atom_vz(i) += param->dtforce * atom_fz(i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void printAtomState(Atom *atom) {
|
void printAtomState(Atom *atom) {
|
||||||
printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n", atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);
|
printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n", atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);
|
||||||
// int nall = atom->Nlocal + atom->Nghost;
|
// int nall = atom->Nlocal + atom->Nghost;
|
||||||
@ -129,7 +111,7 @@ void printAtomState(Atom *atom) {
|
|||||||
// }
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
|
double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, Stats *stats) {
|
||||||
if(param->force_field == FF_EAM) {
|
if(param->force_field == FF_EAM) {
|
||||||
return computeForceEam(eam, param, atom, neighbor, stats);
|
return computeForceEam(eam, param, atom, neighbor, stats);
|
||||||
} else if(param->force_field == FF_DEM) {
|
} else if(param->force_field == FF_DEM) {
|
||||||
@ -145,14 +127,18 @@ double computeForce(Eam *eam, Parameter *param, Atom *atom, Neighbor *neighbor,
|
|||||||
return computeForceLJHalfNeigh(param, atom, neighbor, stats);
|
return computeForceLJHalfNeigh(param, atom, neighbor, stats);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef CUDA_TARGET
|
||||||
|
return computeForceLJFullNeigh(param, atom, neighbor, c_atom, c_neighbor);
|
||||||
|
#else
|
||||||
return computeForceLJFullNeigh(param, atom, neighbor, stats);
|
return computeForceLJFullNeigh(param, atom, neighbor, stats);
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
double timer[NUMTIMER];
|
double timer[NUMTIMER];
|
||||||
Eam eam;
|
Eam eam;
|
||||||
Atom atom;
|
Atom atom, c_atom;
|
||||||
Neighbor neighbor;
|
Neighbor neighbor, c_neighbor;
|
||||||
Stats stats;
|
Stats stats;
|
||||||
Parameter param;
|
Parameter param;
|
||||||
|
|
||||||
@ -240,16 +226,16 @@ int main(int argc, char** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
param.cutneigh = param.cutforce + param.skin;
|
param.cutneigh = param.cutforce + param.skin;
|
||||||
setup(¶m, &eam, &atom, &neighbor, &stats);
|
setup(¶m, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats);
|
||||||
printParameter(¶m);
|
printParameter(¶m);
|
||||||
|
|
||||||
printf("step\ttemp\t\tpressure\n");
|
printf("step\ttemp\t\tpressure\n");
|
||||||
computeThermo(0, ¶m, &atom);
|
computeThermo(0, ¶m, &atom);
|
||||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
timer[FORCE] = computeForce(&eam, ¶m, &atom, &neighbor, &stats);
|
timer[FORCE] = computeForce(&eam, ¶m, &atom, &neighbor, &c_atom, &c_neighbor, &stats);
|
||||||
timer[NEIGH] = 0.0;
|
timer[NEIGH] = 0.0;
|
||||||
timer[TOTAL] = getTimeStamp();
|
timer[TOTAL] = getTimeStamp();
|
||||||
|
|
||||||
@ -258,21 +244,26 @@ int main(int argc, char** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
for(int n = 0; n < param.ntimes; n++) {
|
for(int n = 0; n < param.ntimes; n++) {
|
||||||
initialIntegrate(¶m, &atom);
|
bool reneigh = (n + 1) % param.reneigh_every == 0;
|
||||||
|
initialIntegrate(reneigh, ¶m, &atom, &c_atom);
|
||||||
if((n + 1) % param.reneigh_every) {
|
if((n + 1) % param.reneigh_every) {
|
||||||
updatePbc(&atom, ¶m);
|
updatePbc(&atom, &c_atom, ¶m, false);
|
||||||
} else {
|
} else {
|
||||||
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor);
|
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor, &c_atom, &c_neighbor);
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
#if defined(MEM_TRACER) || defined(INDEX_TRACER)
|
||||||
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
traceAddresses(¶m, &atom, &neighbor, n + 1);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
timer[FORCE] += computeForce(&eam, ¶m, &atom, &neighbor, &stats);
|
timer[FORCE] += computeForce(&eam, ¶m, &atom, &neighbor, &c_atom, &c_neighbor, &stats);
|
||||||
finalIntegrate(¶m, &atom);
|
finalIntegrate(reneigh, ¶m, &atom, &c_atom);
|
||||||
|
|
||||||
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
||||||
|
#ifdef CUDA_TARGET
|
||||||
|
checkCUDAError("computeThermo atom->x memcpy back", cudaMemcpy(atom.x, c_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3, cudaMemcpyDeviceToHost));
|
||||||
|
#endif
|
||||||
|
|
||||||
computeThermo(n + 1, ¶m, &atom);
|
computeThermo(n + 1, ¶m, &atom);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -287,11 +278,11 @@ int main(int argc, char** argv) {
|
|||||||
printf(HLINE);
|
printf(HLINE);
|
||||||
printf("Force field: %s\n", ff2str(param.force_field));
|
printf("Force field: %s\n", ff2str(param.force_field));
|
||||||
printf("Data layout for positions: %s\n", POS_DATA_LAYOUT);
|
printf("Data layout for positions: %s\n", POS_DATA_LAYOUT);
|
||||||
#if PRECISION == 1
|
#if PRECISION == 1
|
||||||
printf("Using single precision floating point.\n");
|
printf("Using single precision floating point.\n");
|
||||||
#else
|
#else
|
||||||
printf("Using double precision floating point.\n");
|
printf("Using double precision floating point.\n");
|
||||||
#endif
|
#endif
|
||||||
printf(HLINE);
|
printf(HLINE);
|
||||||
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
|
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
|
||||||
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
|
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
|
||||||
|
@ -169,7 +169,7 @@ void setupNeighbor(Parameter* param) {
|
|||||||
bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
|
bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
|
||||||
}
|
}
|
||||||
|
|
||||||
void buildNeighbor(Atom *atom, Neighbor *neighbor) {
|
void buildNeighbor_cpu(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
|
||||||
int nall = atom->Nlocal + atom->Nghost;
|
int nall = atom->Nlocal + atom->Nghost;
|
||||||
|
|
||||||
/* extend atom arrays if necessary */
|
/* extend atom arrays if necessary */
|
||||||
|
@ -20,9 +20,10 @@
|
|||||||
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||||
* =======================================================================================
|
* =======================================================================================
|
||||||
*/
|
*/
|
||||||
|
#include <stdbool.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
//---
|
||||||
#include <pbc.h>
|
#include <pbc.h>
|
||||||
#include <atom.h>
|
#include <atom.h>
|
||||||
#include <allocate.h>
|
#include <allocate.h>
|
||||||
@ -43,7 +44,7 @@ void initPbc(Atom* atom) {
|
|||||||
|
|
||||||
/* update coordinates of ghost atoms */
|
/* update coordinates of ghost atoms */
|
||||||
/* uses mapping created in setupPbc */
|
/* uses mapping created in setupPbc */
|
||||||
void updatePbc(Atom *atom, Parameter *param) {
|
void updatePbc_cpu(Atom *atom, Atom *c_atom, Parameter *param, bool doReneighbor) {
|
||||||
int *border_map = atom->border_map;
|
int *border_map = atom->border_map;
|
||||||
int nlocal = atom->Nlocal;
|
int nlocal = atom->Nlocal;
|
||||||
MD_FLOAT xprd = param->xprd;
|
MD_FLOAT xprd = param->xprd;
|
||||||
@ -59,7 +60,7 @@ void updatePbc(Atom *atom, Parameter *param) {
|
|||||||
|
|
||||||
/* relocate atoms that have left domain according
|
/* relocate atoms that have left domain according
|
||||||
* to periodic boundary conditions */
|
* to periodic boundary conditions */
|
||||||
void updateAtomsPbc(Atom *atom, Parameter *param) {
|
void updateAtomsPbc_cpu(Atom *atom, Atom *c_atom, Parameter *param) {
|
||||||
MD_FLOAT xprd = param->xprd;
|
MD_FLOAT xprd = param->xprd;
|
||||||
MD_FLOAT yprd = param->yprd;
|
MD_FLOAT yprd = param->yprd;
|
||||||
MD_FLOAT zprd = param->zprd;
|
MD_FLOAT zprd = param->zprd;
|
||||||
|
@ -32,8 +32,7 @@
|
|||||||
#define IR 2836
|
#define IR 2836
|
||||||
#define MASK 123459876
|
#define MASK 123459876
|
||||||
|
|
||||||
double myrandom(int* seed)
|
double myrandom(int* seed) {
|
||||||
{
|
|
||||||
int k= (*seed) / IQ;
|
int k= (*seed) / IQ;
|
||||||
double ans;
|
double ans;
|
||||||
|
|
||||||
@ -43,8 +42,7 @@ double myrandom(int* seed)
|
|||||||
return ans;
|
return ans;
|
||||||
}
|
}
|
||||||
|
|
||||||
void random_reset(int *seed, int ibase, double *coord)
|
void random_reset(int *seed, int ibase, double *coord) {
|
||||||
{
|
|
||||||
int i;
|
int i;
|
||||||
char *str = (char *) &ibase;
|
char *str = (char *) &ibase;
|
||||||
int n = sizeof(int);
|
int n = sizeof(int);
|
||||||
@ -80,18 +78,21 @@ void random_reset(int *seed, int ibase, double *coord)
|
|||||||
//save = 0;
|
//save = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
int str2ff(const char *string)
|
int str2ff(const char *string) {
|
||||||
{
|
|
||||||
if(strncmp(string, "lj", 2) == 0) return FF_LJ;
|
if(strncmp(string, "lj", 2) == 0) return FF_LJ;
|
||||||
if(strncmp(string, "eam", 3) == 0) return FF_EAM;
|
if(strncmp(string, "eam", 3) == 0) return FF_EAM;
|
||||||
if(strncmp(string, "dem", 3) == 0) return FF_DEM;
|
if(strncmp(string, "dem", 3) == 0) return FF_DEM;
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char* ff2str(int ff)
|
const char* ff2str(int ff) {
|
||||||
{
|
|
||||||
if(ff == FF_LJ) { return "lj"; }
|
if(ff == FF_LJ) { return "lj"; }
|
||||||
if(ff == FF_EAM) { return "eam"; }
|
if(ff == FF_EAM) { return "eam"; }
|
||||||
if(ff == FF_DEM) { return "dem"; }
|
if(ff == FF_DEM) { return "dem"; }
|
||||||
return "invalid";
|
return "invalid";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int get_num_threads() {
|
||||||
|
const char *num_threads_env = getenv("NUM_THREADS");
|
||||||
|
return (num_threads_env == NULL) ? 32 : atoi(num_threads_env);
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user