Compare commits
70 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
bc7b523979 | ||
|
eeba125a52 | ||
|
b32254b03f | ||
|
4dac820784 | ||
|
fe56c50efd | ||
|
7a61cbbabf | ||
|
176de0525b | ||
|
7bad7e84b6 | ||
|
fb304f240b | ||
|
5a6d1851ed | ||
|
f61f59ba3f | ||
|
d1c2249b55 | ||
|
c9db6e45fa | ||
|
0967e8f671 | ||
|
fa409c016c | ||
|
b65199308d | ||
|
71798f5ec5 | ||
|
4f0403d3ea | ||
|
fa86e44f90 | ||
|
7e8fd96fa4 | ||
|
463de5b1ed | ||
|
4a32a62a98 | ||
|
16e8b76012 | ||
|
60ed524dd8 | ||
|
45f83c7607 | ||
|
c49278cb21 | ||
|
757d4329f3 | ||
|
67f9c769ef | ||
|
b5b4d23c0c | ||
|
fea1e41daa | ||
|
f1998b7acc | ||
|
2fe3cd80a0 | ||
|
f4313f64e5 | ||
|
7f068a6959 | ||
|
62cfc22856 | ||
|
b024adaf5b | ||
|
696e6da01d | ||
|
b2a6574426 | ||
|
c4080e866e | ||
|
7b592b5fc7 | ||
|
4690542db5 | ||
|
8c131a7699 | ||
|
dc4d5f1a9c | ||
|
50007216ed | ||
|
72e4599acc | ||
|
8fa03733e9 | ||
|
bf1ae3d013 | ||
|
8009b54113 | ||
|
0ea0587442 | ||
|
134e3f4b78 | ||
|
c2bfa3ca3f | ||
|
2a099da5b7 | ||
|
7691b23d67 | ||
|
da90466f98 | ||
|
8f723c1299 | ||
|
0586ef150a | ||
|
2e5d973f7d | ||
|
e2fd1a0476 | ||
|
4105c844c6 | ||
|
1f5c9c4b23 | ||
|
29e115464b | ||
|
1a54314c8b | ||
|
280f595b7f | ||
|
3428974730 | ||
|
b54842f764 | ||
|
9730164e6f | ||
|
0f5fdd3708 | ||
|
3f7fb7f22a | ||
|
bfa6c581c3 | ||
|
fd886e77eb |
2
.gitignore
vendored
2
.gitignore
vendored
@ -27,6 +27,7 @@
|
||||
*.so
|
||||
*.so.*
|
||||
*.dylib
|
||||
.DS_Store
|
||||
|
||||
# Executables
|
||||
*.exe
|
||||
@ -52,6 +53,7 @@ Mkfile.old
|
||||
dkms.conf
|
||||
|
||||
# Build directories and executables
|
||||
.vscode/
|
||||
GCC/
|
||||
ICC/
|
||||
MDBench-GCC*
|
||||
|
8
Makefile
8
Makefile
@ -58,6 +58,9 @@ ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.
|
||||
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))
|
||||
OBJ = $(filter-out $(BUILD_DIR)/main% $(OVERWRITE),$(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c)))
|
||||
OBJ += $(patsubst $(ASM_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*.s))
|
||||
OBJ += $(patsubst $(SRC_DIR)/%.cu, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cu))
|
||||
|
||||
|
||||
CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(OPTIONS) $(INCLUDES)
|
||||
|
||||
# $(warning $(OBJ))
|
||||
@ -88,6 +91,11 @@ $(BUILD_DIR)/%.o: %.s
|
||||
$(info ===> ASSEMBLE $@)
|
||||
$(Q)$(AS) $< -o $@
|
||||
|
||||
$(BUILD_DIR)/%.o: %.cu
|
||||
$(info ===> COMPILE $@)
|
||||
$(Q)$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
|
||||
$(Q)$(CC) $(CPPFLAGS) -MT $@ -MM $< > $(BUILD_DIR)/$*.d
|
||||
|
||||
.PHONY: clean distclean tags info asm
|
||||
|
||||
clean:
|
||||
|
@ -1,5 +1,5 @@
|
||||
# Compiler tag (GCC/CLANG/ICC)
|
||||
TAG ?= CLANG
|
||||
# Compiler tag (GCC/CLANG/ICC/NVCC)
|
||||
TAG ?= NVCC
|
||||
# Enable likwid (true or false)
|
||||
ENABLE_LIKWID ?= false
|
||||
# SP or DP
|
||||
@ -22,7 +22,7 @@ INDEX_TRACER ?= false
|
||||
# Vector width (elements) for index and distance tracer
|
||||
VECTOR_WIDTH ?= 8
|
||||
# Compute statistics
|
||||
COMPUTE_STATS ?= true
|
||||
COMPUTE_STATS ?= false
|
||||
|
||||
#Feature options
|
||||
OPTIONS = -DALIGNMENT=64
|
||||
|
10
evaluate_cpu_openmpi_metrics.sh
Normal file
10
evaluate_cpu_openmpi_metrics.sh
Normal file
@ -0,0 +1,10 @@
|
||||
END=32
|
||||
for ((i=1;i<=END;i++)); do
|
||||
output=$(eval "likwid-mpirun -np 1 -t $i -m -g FLOPS_DP -omp gnu ./MDBench-GCC -n 50")
|
||||
echo -n "$i,"
|
||||
echo "$output" > "FLOPS_DP/thread_$i.txt"
|
||||
done
|
||||
|
||||
## likwid perf measurements on testfront1:
|
||||
# srun --nodes=1 --exclusive --nodelist=rome1 --time=00:30:00 --export=NONE -c 64 -C hwperf --pty /bin/bash -l
|
||||
# likwid-mpirun -np 1 -t 32 -m -g MEM -omp gnu -d ./MDBench-GCC
|
6
evaluate_cpu_runtime.sh
Normal file
6
evaluate_cpu_runtime.sh
Normal file
@ -0,0 +1,6 @@
|
||||
#!/bin/bash
|
||||
for i in $(seq 1 32); do
|
||||
echo "$i"
|
||||
export "OMP_NUM_THREADS=$i"
|
||||
./MDBench-GCC -n 50 | grep "Performance"
|
||||
done
|
5
evaluate_gpu_ncu_profiles_per_thread.sh
Normal file
5
evaluate_gpu_ncu_profiles_per_thread.sh
Normal file
@ -0,0 +1,5 @@
|
||||
END=32
|
||||
for ((i=16;i<=END;i++)); do
|
||||
export NUM_THREADS=$i
|
||||
$(eval "ncu --set full -o /home/hpc/rzku/ptfs410h/MD-Bench/log/MG/presentation_2/Resources/GPU/Metrics/threads_$i ./MDBench-NVCC -n 50")
|
||||
done
|
6
evaluate_gpu_perf_per_thread.sh
Normal file
6
evaluate_gpu_perf_per_thread.sh
Normal file
@ -0,0 +1,6 @@
|
||||
END=64
|
||||
for ((i=1;i<=END;i*=2)); do
|
||||
output=$(eval "NUM_THREADS=$i ./MDBench-NVCC -n 2000")
|
||||
echo -n "$i,"
|
||||
echo "$output" | grep 'atom updates per second' | sed 's/[^0-9.,]//g' | awk '{print $1"e6"}'
|
||||
done
|
16
include_NVCC.mk
Normal file
16
include_NVCC.mk
Normal file
@ -0,0 +1,16 @@
|
||||
CC = nvcc
|
||||
LINKER = $(CC)
|
||||
|
||||
ANSI_CFLAGS = -ansi
|
||||
ANSI_CFLAGS += -std=c99
|
||||
ANSI_CFLAGS += -pedantic
|
||||
ANSI_CFLAGS += -Wextra
|
||||
|
||||
# CFLAGS = -O0 -g -std=c99 -fargument-noalias
|
||||
#CFLAGS = -O3 -g -arch=sm_61 # -fopenmp
|
||||
CFLAGS = -O3 -g # -fopenmp
|
||||
ASFLAGS = -masm=intel
|
||||
LFLAGS =
|
||||
DEFINES = -D_GNU_SOURCE #-DLIKWID_PERFMON
|
||||
INCLUDES = $(LIKWID_INC)
|
||||
LIBS = -lm $(LIKWID_LIB) -lcuda -lcudart #-llikwid
|
@ -25,11 +25,29 @@
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
void checkCUDAError(const char *msg, cudaError_t err)
|
||||
{
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
//print a human readable error message
|
||||
printf("[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err));
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void* allocate (int alignment, size_t bytesize)
|
||||
{
|
||||
int errorCode;
|
||||
void* ptr;
|
||||
|
||||
checkCUDAError( "allocate", cudaMallocHost((void**)&ptr, bytesize) );
|
||||
|
||||
return ptr;
|
||||
|
||||
/*
|
||||
errorCode = posix_memalign(&ptr, alignment, bytesize);
|
||||
|
||||
if (errorCode) {
|
||||
@ -51,6 +69,7 @@ void* allocate (int alignment, size_t bytesize)
|
||||
}
|
||||
|
||||
return ptr;
|
||||
*/
|
||||
}
|
||||
|
||||
void* reallocate (
|
||||
@ -59,11 +78,11 @@ void* reallocate (
|
||||
size_t newBytesize,
|
||||
size_t oldBytesize)
|
||||
{
|
||||
void* newarray = allocate(alignment, newBytesize);
|
||||
void* newarray = allocate(alignment, newBytesize);
|
||||
|
||||
if(ptr != NULL) {
|
||||
memcpy(newarray, ptr, oldBytesize);
|
||||
free(ptr);
|
||||
cudaFreeHost(ptr);
|
||||
}
|
||||
|
||||
return newarray;
|
@ -30,6 +30,9 @@
|
||||
#include <allocate.h>
|
||||
#include <util.h>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
|
||||
#define DELTA 20000
|
||||
|
||||
void initAtom(Atom *atom)
|
||||
@ -57,10 +60,10 @@ void createAtom(Atom *atom, Parameter *param)
|
||||
atom->Natoms = 4 * param->nx * param->ny * param->nz;
|
||||
atom->Nlocal = 0;
|
||||
atom->ntypes = param->ntypes;
|
||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
checkCUDAError( "atom->epsilon cudaMallocHost", cudaMallocHost((void**)&(atom->epsilon), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
checkCUDAError( "atom->sigma6 cudaMallocHost", cudaMallocHost((void**)&(atom->sigma6), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
checkCUDAError( "atom->cutforcesq cudaMallocHost", cudaMallocHost((void**)&(atom->cutforcesq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
checkCUDAError( "atom->cutneighsq cudaMallocHost", cudaMallocHost((void**)&(atom->cutneighsq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||
atom->epsilon[i] = param->epsilon;
|
||||
atom->sigma6[i] = param->sigma6;
|
||||
@ -134,9 +137,9 @@ void createAtom(Atom *atom, Parameter *param)
|
||||
atom_x(atom->Nlocal) = xtmp;
|
||||
atom_y(atom->Nlocal) = ytmp;
|
||||
atom_z(atom->Nlocal) = ztmp;
|
||||
atom->vx[atom->Nlocal] = vxtmp;
|
||||
atom->vy[atom->Nlocal] = vytmp;
|
||||
atom->vz[atom->Nlocal] = vztmp;
|
||||
atom_vx(atom->Nlocal) = vxtmp;
|
||||
atom_vy(atom->Nlocal) = vytmp;
|
||||
atom_vz(atom->Nlocal) = vztmp;
|
||||
atom->type[atom->Nlocal] = rand() % atom->ntypes;
|
||||
atom->Nlocal++;
|
||||
}
|
||||
@ -159,16 +162,24 @@ void growAtom(Atom *atom)
|
||||
|
||||
#ifdef AOS
|
||||
atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
|
||||
atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
|
||||
atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT) * 3, nold * sizeof(MD_FLOAT) * 3);
|
||||
#else
|
||||
atom->x = (MD_FLOAT*) reallocate(atom->x, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->y = (MD_FLOAT*) reallocate(atom->y, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->z = (MD_FLOAT*) reallocate(atom->z, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
#endif
|
||||
atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
|
||||
atom->fx = (MD_FLOAT*) reallocate(atom->fx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->fy = (MD_FLOAT*) reallocate(atom->fy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->fz = (MD_FLOAT*) reallocate(atom->fz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
|
||||
atom->vx = (MD_FLOAT*) reallocate(atom->vx, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->vy = (MD_FLOAT*) reallocate(atom->vy, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
atom->vz = (MD_FLOAT*) reallocate(atom->vz, ALIGNMENT, atom->Nmax * sizeof(MD_FLOAT), nold * sizeof(MD_FLOAT));
|
||||
|
||||
#endif
|
||||
|
||||
atom->type = (int *) reallocate(atom->type, ALIGNMENT, atom->Nmax * sizeof(int), nold * sizeof(int));
|
||||
}
|
212
lammps/force.cu
Normal file
212
lammps/force.cu
Normal file
@ -0,0 +1,212 @@
|
||||
/*
|
||||
* =======================================================================================
|
||||
*
|
||||
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
|
||||
*
|
||||
* This file is part of MD-Bench.
|
||||
*
|
||||
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||
* under the terms of the GNU Lesser General Public License as published
|
||||
* by the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License along
|
||||
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||
* =======================================================================================
|
||||
*/
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stddef.h>
|
||||
#include <cuda_profiler_api.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
|
||||
extern "C" {
|
||||
#include <likwid-marker.h>
|
||||
|
||||
#include <timing.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
}
|
||||
|
||||
// cuda kernel
|
||||
__global__ void calc_force(
|
||||
Atom a,
|
||||
MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon,
|
||||
int Nlocal, int neigh_maxneighs, int *neigh_neighbors, int *neigh_numneigh) {
|
||||
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if( i >= Nlocal ) {
|
||||
return;
|
||||
}
|
||||
|
||||
Atom *atom = &a;
|
||||
|
||||
const int numneighs = neigh_numneigh[i];
|
||||
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
|
||||
MD_FLOAT fix = 0;
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neigh_neighbors[atom->Nlocal * k + i];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * atom->ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
|
||||
const MD_FLOAT epsilon = atom->epsilon[type_ij];
|
||||
#endif
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = 1.0 / rsq;
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
}
|
||||
}
|
||||
|
||||
atom_fx(i) = fix;
|
||||
atom_fy(i) = fiy;
|
||||
atom_fz(i) = fiz;
|
||||
}
|
||||
|
||||
__global__ void kernel_initial_integrate(MD_FLOAT dtforce, MD_FLOAT dt, int Nlocal, Atom a) {
|
||||
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if( i >= Nlocal ) {
|
||||
return;
|
||||
}
|
||||
|
||||
Atom *atom = &a;
|
||||
|
||||
atom_vx(i) += dtforce * atom_fx(i);
|
||||
atom_vy(i) += dtforce * atom_fy(i);
|
||||
atom_vz(i) += dtforce * atom_fz(i);
|
||||
atom_x(i) = atom_x(i) + dt * atom_vx(i);
|
||||
atom_y(i) = atom_y(i) + dt * atom_vy(i);
|
||||
atom_z(i) = atom_z(i) + dt * atom_vz(i);
|
||||
}
|
||||
|
||||
__global__ void kernel_final_integrate(MD_FLOAT dtforce, int Nlocal, Atom a) {
|
||||
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if( i >= Nlocal ) {
|
||||
return;
|
||||
}
|
||||
|
||||
Atom *atom = &a;
|
||||
|
||||
atom_vx(i) += dtforce * atom_fx(i);
|
||||
atom_vy(i) += dtforce * atom_fy(i);
|
||||
atom_vz(i) += dtforce * atom_fz(i);
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
|
||||
|
||||
void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) {
|
||||
|
||||
const int Nlocal = atom->Nlocal;
|
||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||
|
||||
kernel_final_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, Nlocal, *c_atom);
|
||||
|
||||
checkCUDAError( "PeekAtLastError FinalIntegrate", cudaPeekAtLastError() );
|
||||
checkCUDAError( "DeviceSync FinalIntegrate", cudaDeviceSynchronize() );
|
||||
|
||||
if(doReneighbour) {
|
||||
checkCUDAError( "FinalIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
|
||||
}
|
||||
}
|
||||
|
||||
void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom, const int num_threads_per_block) {
|
||||
|
||||
const int Nlocal = atom->Nlocal;
|
||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||
|
||||
kernel_initial_integrate <<< num_blocks, num_threads_per_block >>> (param->dtforce, param->dt, Nlocal, *c_atom);
|
||||
|
||||
checkCUDAError( "PeekAtLastError InitialIntegrate", cudaPeekAtLastError() );
|
||||
checkCUDAError( "DeviceSync InitialIntegrate", cudaDeviceSynchronize() );
|
||||
|
||||
if(doReneighbour) {
|
||||
checkCUDAError( "InitialIntegrate: velocity memcpy", cudaMemcpy(atom->vx, c_atom->vx, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
|
||||
}
|
||||
}
|
||||
|
||||
double computeForce(
|
||||
bool reneighbourHappenend,
|
||||
Parameter *param,
|
||||
Atom *atom,
|
||||
Neighbor *neighbor,
|
||||
Atom *c_atom,
|
||||
Neighbor *c_neighbor,
|
||||
int num_threads_per_block
|
||||
)
|
||||
{
|
||||
int Nlocal = atom->Nlocal;
|
||||
#ifndef EXPLICIT_TYPES
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
#endif
|
||||
|
||||
/*
|
||||
int nDevices;
|
||||
cudaGetDeviceCount(&nDevices);
|
||||
size_t free, total;
|
||||
for(int i = 0; i < nDevices; ++i) {
|
||||
cudaMemGetInfo( &free, &total );
|
||||
cudaDeviceProp prop;
|
||||
cudaGetDeviceProperties(&prop, i);
|
||||
printf("DEVICE %d/%d NAME: %s\r\n with %ld MB/%ld MB memory used", i + 1, nDevices, prop.name, free / 1024 / 1024, total / 1024 / 1024);
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
// HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
|
||||
|
||||
// checkCUDAError( "c_atom->fx memset", cudaMemset(c_atom->fx, 0, sizeof(MD_FLOAT) * Nlocal * 3) );
|
||||
|
||||
cudaProfilerStart();
|
||||
|
||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
calc_force <<< num_blocks, num_threads_per_block >>> (*c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighbor->neighbors, c_neighbor->numneigh);
|
||||
|
||||
checkCUDAError( "PeekAtLastError ComputeForce", cudaPeekAtLastError() );
|
||||
checkCUDAError( "DeviceSync ComputeForce", cudaDeviceSynchronize() );
|
||||
|
||||
cudaProfilerStop();
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
double E = getTimeStamp();
|
||||
|
||||
return E-S;
|
||||
}
|
||||
}
|
@ -22,8 +22,12 @@
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#ifndef __ALLOCATE_H_
|
||||
#define __ALLOCATE_H_
|
||||
extern void* allocate (int alignment, size_t bytesize);
|
||||
extern void* reallocate (void* ptr, int alignment, size_t newBytesize, size_t oldBytesize);
|
||||
|
||||
extern void checkCUDAError(const char *msg, cudaError_t err);
|
||||
#endif
|
@ -45,14 +45,34 @@ extern void growAtom(Atom*);
|
||||
|
||||
#ifdef AOS
|
||||
#define POS_DATA_LAYOUT "AoS"
|
||||
|
||||
#define atom_x(i) atom->x[(i) * 3 + 0]
|
||||
#define atom_y(i) atom->x[(i) * 3 + 1]
|
||||
#define atom_z(i) atom->x[(i) * 3 + 2]
|
||||
|
||||
#define atom_fx(i) atom->fx[(i) * 3 + 0]
|
||||
#define atom_fy(i) atom->fx[(i) * 3 + 1]
|
||||
#define atom_fz(i) atom->fx[(i) * 3 + 2]
|
||||
|
||||
#define atom_vx(i) atom->vx[(i) * 3 + 0]
|
||||
#define atom_vy(i) atom->vx[(i) * 3 + 1]
|
||||
#define atom_vz(i) atom->vx[(i) * 3 + 2]
|
||||
|
||||
#else
|
||||
#define POS_DATA_LAYOUT "SoA"
|
||||
|
||||
#define atom_x(i) atom->x[i]
|
||||
#define atom_y(i) atom->y[i]
|
||||
#define atom_z(i) atom->z[i]
|
||||
|
||||
#define atom_fx(i) atom->fx[i]
|
||||
#define atom_fy(i) atom->fy[i]
|
||||
#define atom_fz(i) atom->fz[i]
|
||||
|
||||
#define atom_vx(i) atom->vx[i]
|
||||
#define atom_vy(i) atom->vy[i]
|
||||
#define atom_vz(i) atom->vz[i]
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
@ -33,9 +33,26 @@ typedef struct {
|
||||
int* numneigh;
|
||||
} Neighbor;
|
||||
|
||||
typedef struct {
|
||||
MD_FLOAT xprd; MD_FLOAT yprd; MD_FLOAT zprd;
|
||||
MD_FLOAT bininvx; MD_FLOAT bininvy; MD_FLOAT bininvz;
|
||||
int mbinxlo; int mbinylo; int mbinzlo;
|
||||
int nbinx; int nbiny; int nbinz;
|
||||
int mbinx; int mbiny; int mbinz;
|
||||
} Neighbor_params;
|
||||
|
||||
typedef struct {
|
||||
int* bincount;
|
||||
int* bins;
|
||||
int mbins;
|
||||
int atoms_per_bin;
|
||||
} Binning;
|
||||
|
||||
extern void initNeighbor(Neighbor*, Parameter*);
|
||||
extern void setupNeighbor();
|
||||
extern void binatoms(Atom*);
|
||||
extern void buildNeighbor(Atom*, Neighbor*);
|
||||
extern void sortAtom(Atom*);
|
||||
extern void binatoms_cuda(Atom*, Binning*, int*, Neighbor_params*, const int);
|
||||
extern void buildNeighbor_cuda(Atom*, Neighbor*, Atom*, Neighbor*, const int, double*);
|
||||
#endif
|
@ -25,8 +25,10 @@
|
||||
|
||||
#ifndef __PBC_H_
|
||||
#define __PBC_H_
|
||||
extern void initPbc();
|
||||
extern void initPbc(Atom*);
|
||||
extern void updatePbc(Atom*, Parameter*);
|
||||
extern void updatePbc_cuda(Atom*, Parameter*, Atom*, bool, const int);
|
||||
extern void updateAtomsPbc(Atom*, Parameter*);
|
||||
extern void updateAtomsPbc_cuda(Atom*, Parameter*, Atom*, const int);
|
||||
extern void setupPbc(Atom*, Parameter*);
|
||||
#endif
|
@ -5,6 +5,11 @@ typedef enum {
|
||||
TOTAL = 0,
|
||||
NEIGH,
|
||||
FORCE,
|
||||
NEIGH_UPDATE_ATOMS_PBC,
|
||||
NEIGH_SETUP_PBC,
|
||||
NEIGH_UPDATE_PBC,
|
||||
NEIGH_BINATOMS,
|
||||
NEIGH_BUILD_LISTS,
|
||||
NUMTIMER
|
||||
} timertype;
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
#include <unistd.h>
|
||||
#include <limits.h>
|
||||
#include <math.h>
|
||||
@ -44,7 +45,14 @@
|
||||
|
||||
#define HLINE "----------------------------------------------------------------------------\n"
|
||||
|
||||
extern double computeForce(Parameter*, Atom*, Neighbor*);
|
||||
extern void cuda_final_integrate(bool doReneighbour, Parameter *param,
|
||||
Atom *atom, Atom *c_atom,
|
||||
const int num_threads_per_block);
|
||||
extern void cuda_initial_integrate(bool doReneighbour, Parameter *param,
|
||||
Atom *atom, Atom *c_atom,
|
||||
const int num_threads_per_block);
|
||||
|
||||
extern double computeForce(bool, Parameter*, Atom*, Neighbor*, Atom*, Neighbor*, const int);
|
||||
extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int);
|
||||
extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep);
|
||||
|
||||
@ -72,12 +80,52 @@ void init(Parameter *param)
|
||||
param->proc_freq = 2.4;
|
||||
}
|
||||
|
||||
void initCudaAtom(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) {
|
||||
|
||||
c_atom->Natoms = atom->Natoms;
|
||||
c_atom->Nlocal = atom->Nlocal;
|
||||
c_atom->Nghost = atom->Nghost;
|
||||
c_atom->Nmax = atom->Nmax;
|
||||
c_atom->ntypes = atom->ntypes;
|
||||
|
||||
c_atom->border_map = NULL;
|
||||
|
||||
const int Nlocal = atom->Nlocal;
|
||||
|
||||
checkCUDAError( "c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
|
||||
checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
|
||||
|
||||
checkCUDAError( "c_atom->fx malloc", cudaMalloc((void**)&(c_atom->fx), sizeof(MD_FLOAT) * Nlocal * 3) );
|
||||
|
||||
checkCUDAError( "c_atom->vx malloc", cudaMalloc((void**)&(c_atom->vx), sizeof(MD_FLOAT) * Nlocal * 3) );
|
||||
checkCUDAError( "c_atom->vx memcpy", cudaMemcpy(c_atom->vx, atom->vx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyHostToDevice) );
|
||||
|
||||
checkCUDAError( "c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
|
||||
checkCUDAError( "c_atom->epsilon malloc", cudaMalloc((void**)&(c_atom->epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
|
||||
checkCUDAError( "c_atom->sigma6 malloc", cudaMalloc((void**)&(c_atom->sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
|
||||
checkCUDAError( "c_atom->cutforcesq malloc", cudaMalloc((void**)&(c_atom->cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
|
||||
|
||||
checkCUDAError( "c_neighbor->neighbors malloc", cudaMalloc((void**)&c_neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs) );
|
||||
checkCUDAError( "c_neighbor->numneigh malloc", cudaMalloc((void**)&c_neighbor->numneigh, sizeof(int) * Nlocal) );
|
||||
|
||||
checkCUDAError( "c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
|
||||
checkCUDAError( "c_atom->sigma6 memcpy", cudaMemcpy(c_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
|
||||
checkCUDAError( "c_atom->epsilon memcpy", cudaMemcpy(c_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
|
||||
|
||||
checkCUDAError( "c_atom->cutforcesq memcpy", cudaMemcpy(c_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
|
||||
}
|
||||
|
||||
|
||||
double setup(
|
||||
Parameter *param,
|
||||
Eam *eam,
|
||||
Atom *atom,
|
||||
Neighbor *neighbor,
|
||||
Stats *stats)
|
||||
Atom *c_atom,
|
||||
Neighbor *c_neighbor,
|
||||
Stats *stats,
|
||||
const int num_threads_per_block,
|
||||
double* timers)
|
||||
{
|
||||
if(param->force_field == FF_EAM) { initEam(eam, param); }
|
||||
double S, E;
|
||||
@ -96,57 +144,69 @@ double setup(
|
||||
setupThermo(param, atom->Natoms);
|
||||
adjustThermo(param, atom);
|
||||
setupPbc(atom, param);
|
||||
updatePbc(atom, param);
|
||||
buildNeighbor(atom, neighbor);
|
||||
initCudaAtom(atom, neighbor, c_atom, c_neighbor);
|
||||
updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
|
||||
buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block, timers);
|
||||
E = getTimeStamp();
|
||||
|
||||
|
||||
return E-S;
|
||||
}
|
||||
|
||||
double reneighbour(
|
||||
Parameter *param,
|
||||
Atom *atom,
|
||||
Neighbor *neighbor)
|
||||
Neighbor *neighbor,
|
||||
Atom *c_atom,
|
||||
Neighbor *c_neighbor,
|
||||
const int num_threads_per_block,
|
||||
double* timers)
|
||||
{
|
||||
double S, E;
|
||||
double S, E, beforeEvent, afterEvent;
|
||||
|
||||
S = getTimeStamp();
|
||||
beforeEvent = S;
|
||||
LIKWID_MARKER_START("reneighbour");
|
||||
updateAtomsPbc(atom, param);
|
||||
updateAtomsPbc_cuda(atom, param, c_atom, num_threads_per_block);
|
||||
afterEvent = getTimeStamp();
|
||||
timers[NEIGH_UPDATE_ATOMS_PBC] += afterEvent - beforeEvent;
|
||||
beforeEvent = afterEvent;
|
||||
setupPbc(atom, param);
|
||||
updatePbc(atom, param);
|
||||
afterEvent = getTimeStamp();
|
||||
timers[NEIGH_SETUP_PBC] += afterEvent - beforeEvent;
|
||||
beforeEvent = afterEvent;
|
||||
updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
|
||||
afterEvent = getTimeStamp();
|
||||
timers[NEIGH_UPDATE_PBC] += afterEvent - beforeEvent;
|
||||
beforeEvent = afterEvent;
|
||||
//sortAtom(atom);
|
||||
buildNeighbor(atom, neighbor);
|
||||
buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block, timers);
|
||||
LIKWID_MARKER_STOP("reneighbour");
|
||||
E = getTimeStamp();
|
||||
afterEvent = E;
|
||||
timers[NEIGH_BUILD_LISTS] += afterEvent - beforeEvent;
|
||||
|
||||
return E-S;
|
||||
}
|
||||
|
||||
void initialIntegrate(Parameter *param, Atom *atom)
|
||||
{
|
||||
MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
|
||||
MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
vx[i] += param->dtforce * fx[i];
|
||||
vy[i] += param->dtforce * fy[i];
|
||||
vz[i] += param->dtforce * fz[i];
|
||||
atom_x(i) = atom_x(i) + param->dt * vx[i];
|
||||
atom_y(i) = atom_y(i) + param->dt * vy[i];
|
||||
atom_z(i) = atom_z(i) + param->dt * vz[i];
|
||||
atom_vx(i) += param->dtforce * atom_fx(i);
|
||||
atom_vy(i) += param->dtforce * atom_fy(i);
|
||||
atom_vz(i) += param->dtforce * atom_fz(i);
|
||||
atom_x(i) = atom_x(i) + param->dt * atom_vx(i);
|
||||
atom_y(i) = atom_y(i) + param->dt * atom_vy(i);
|
||||
atom_z(i) = atom_z(i) + param->dt * atom_vz(i);
|
||||
}
|
||||
}
|
||||
|
||||
void finalIntegrate(Parameter *param, Atom *atom)
|
||||
{
|
||||
MD_FLOAT* fx = atom->fx; MD_FLOAT* fy = atom->fy; MD_FLOAT* fz = atom->fz;
|
||||
MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
vx[i] += param->dtforce * fx[i];
|
||||
vy[i] += param->dtforce * fy[i];
|
||||
vz[i] += param->dtforce * fz[i];
|
||||
atom_vx(i) += param->dtforce * atom_fx(i);
|
||||
atom_vy(i) += param->dtforce * atom_fy(i);
|
||||
atom_vz(i) += param->dtforce * atom_fz(i);
|
||||
}
|
||||
}
|
||||
|
||||
@ -176,6 +236,19 @@ const char* ff2str(int ff)
|
||||
return "invalid";
|
||||
}
|
||||
|
||||
int get_num_threads() {
|
||||
|
||||
const char *num_threads_env = getenv("NUM_THREADS");
|
||||
int num_threads = 0;
|
||||
if(num_threads_env == 0)
|
||||
num_threads = 32;
|
||||
else {
|
||||
num_threads = atoi(num_threads_env);
|
||||
}
|
||||
|
||||
return num_threads;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
double timer[NUMTIMER];
|
||||
@ -184,6 +257,8 @@ int main(int argc, char** argv)
|
||||
Neighbor neighbor;
|
||||
Stats stats;
|
||||
Parameter param;
|
||||
Atom c_atom;
|
||||
Neighbor c_neighbor;
|
||||
|
||||
LIKWID_MARKER_INIT;
|
||||
#pragma omp parallel
|
||||
@ -254,7 +329,10 @@ int main(int argc, char** argv)
|
||||
}
|
||||
}
|
||||
|
||||
setup(¶m, &eam, &atom, &neighbor, &stats);
|
||||
// this should be multiple of 32 as operations are performed at the level of warps
|
||||
const int num_threads_per_block = get_num_threads();
|
||||
|
||||
setup(¶m, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block, (double*) &timer);
|
||||
computeThermo(0, ¶m, &atom);
|
||||
if(param.force_field == FF_EAM) {
|
||||
computeForceEam(&eam, ¶m, &atom, &neighbor, &stats, 1, 0);
|
||||
@ -262,25 +340,36 @@ int main(int argc, char** argv)
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
|
||||
computeForceTracing(¶m, &atom, &neighbor, &stats, 1, 0);
|
||||
#else
|
||||
computeForce(¶m, &atom, &neighbor);
|
||||
computeForce(true, ¶m, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
|
||||
#endif
|
||||
}
|
||||
|
||||
timer[FORCE] = 0.0;
|
||||
timer[NEIGH] = 0.0;
|
||||
timer[TOTAL] = getTimeStamp();
|
||||
timer[NEIGH_UPDATE_ATOMS_PBC] = 0.0;
|
||||
timer[NEIGH_SETUP_PBC] = 0.0;
|
||||
timer[NEIGH_UPDATE_PBC] = 0.0;
|
||||
timer[NEIGH_BINATOMS] = 0.0;
|
||||
timer[NEIGH_BUILD_LISTS] = 0.0;
|
||||
|
||||
if(param.vtk_file != NULL) {
|
||||
write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
|
||||
}
|
||||
|
||||
for(int n = 0; n < param.ntimes; n++) {
|
||||
initialIntegrate(¶m, &atom);
|
||||
|
||||
if((n + 1) % param.every) {
|
||||
updatePbc(&atom, ¶m);
|
||||
const bool doReneighbour = (n + 1) % param.every == 0;
|
||||
|
||||
cuda_initial_integrate(doReneighbour, ¶m, &atom, &c_atom, num_threads_per_block);
|
||||
|
||||
if(doReneighbour) {
|
||||
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block, (double*) &timer);
|
||||
} else {
|
||||
timer[NEIGH] += reneighbour(¶m, &atom, &neighbor);
|
||||
double before = getTimeStamp();
|
||||
updatePbc_cuda(&atom, ¶m, &c_atom, false, num_threads_per_block);
|
||||
double after = getTimeStamp();
|
||||
timer[NEIGH_UPDATE_PBC] += after - before;
|
||||
}
|
||||
|
||||
if(param.force_field == FF_EAM) {
|
||||
@ -289,12 +378,14 @@ int main(int argc, char** argv)
|
||||
#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
|
||||
timer[FORCE] += computeForceTracing(¶m, &atom, &neighbor, &stats, 0, n + 1);
|
||||
#else
|
||||
timer[FORCE] += computeForce(¶m, &atom, &neighbor);
|
||||
timer[FORCE] += computeForce(doReneighbour, ¶m, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
|
||||
#endif
|
||||
}
|
||||
finalIntegrate(¶m, &atom);
|
||||
|
||||
cuda_final_integrate(doReneighbour, ¶m, &atom, &c_atom, num_threads_per_block);
|
||||
|
||||
if(!((n + 1) % param.nstat) && (n+1) < param.ntimes) {
|
||||
checkCUDAError("computeThermo atom->x memcpy back", cudaMemcpy(atom.x, c_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3, cudaMemcpyDeviceToHost) );
|
||||
computeThermo(n + 1, ¶m, &atom);
|
||||
}
|
||||
|
||||
@ -303,6 +394,7 @@ int main(int argc, char** argv)
|
||||
}
|
||||
}
|
||||
|
||||
timer[NEIGH_BUILD_LISTS] -= timer[NEIGH_BINATOMS];
|
||||
timer[TOTAL] = getTimeStamp() - timer[TOTAL];
|
||||
computeThermo(-1, ¶m, &atom);
|
||||
|
||||
@ -316,11 +408,15 @@ int main(int argc, char** argv)
|
||||
#endif
|
||||
printf(HLINE);
|
||||
printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
|
||||
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
|
||||
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
|
||||
printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs NEIGH_TIMERS: UPD_AT: %.2fs SETUP_PBC %.2fs UPDATE_PBC %.2fs BINATOMS %.2fs BUILD_NEIGHBOR %.2fs\n",
|
||||
timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH], timer[NEIGH_UPDATE_ATOMS_PBC], timer[NEIGH_SETUP_PBC], timer[NEIGH_UPDATE_PBC], timer[NEIGH_BINATOMS], timer[NEIGH_BUILD_LISTS]);
|
||||
printf(HLINE);
|
||||
printf("Performance: %.2f million atom updates per second\n",
|
||||
1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
|
||||
double atomUpdatesTotal = (double) atom.Natoms * param.ntimes;
|
||||
printf("Force_perf in millions per sec: %.2f\n", 1e-6 * atomUpdatesTotal / timer[FORCE]);
|
||||
double atomNeighUpdatesTotal = (double) atom.Natoms * param.ntimes / param.every;
|
||||
printf("Neighbor_perf in millions per sec: updateAtomsPbc: %.2f setupPbc: %.2f updatePbc: %.2f binAtoms: %.2f buildNeighbor_wo_binning: %.2f\n", 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_UPDATE_ATOMS_PBC], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_SETUP_PBC], 1e-6 * atomUpdatesTotal / timer[NEIGH_UPDATE_PBC], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_BINATOMS], 1e-6 * atomNeighUpdatesTotal / timer[NEIGH_BUILD_LISTS]);
|
||||
#ifdef COMPUTE_STATS
|
||||
displayStatistics(&atom, ¶m, &stats, timer);
|
||||
#endif
|
719
lammps/neighbor.cu
Normal file
719
lammps/neighbor.cu
Normal file
@ -0,0 +1,719 @@
|
||||
/*
|
||||
* =======================================================================================
|
||||
*
|
||||
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
|
||||
*
|
||||
* This file is part of MD-Bench.
|
||||
*
|
||||
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||
* under the terms of the GNU Lesser General Public License as published
|
||||
* by the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License along
|
||||
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||
* =======================================================================================
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <cuda_profiler_api.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <device_launch_parameters.h>
|
||||
|
||||
extern "C" {
|
||||
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <allocate.h>
|
||||
#include <atom.h>
|
||||
#include <timing.h>
|
||||
#include <timers.h>
|
||||
|
||||
#define SMALL 1.0e-6
|
||||
#define FACTOR 0.999
|
||||
}
|
||||
|
||||
__device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin,
|
||||
Neighbor_params np)
|
||||
{
|
||||
int ix, iy, iz;
|
||||
|
||||
if(xin >= np.xprd) {
|
||||
ix = (int)((xin - np.xprd) * np.bininvx) + np.nbinx - np.mbinxlo;
|
||||
} else if(xin >= 0.0) {
|
||||
ix = (int)(xin * np.bininvx) - np.mbinxlo;
|
||||
} else {
|
||||
ix = (int)(xin * np.bininvx) - np.mbinxlo - 1;
|
||||
}
|
||||
|
||||
if(yin >= np.yprd) {
|
||||
iy = (int)((yin - np.yprd) * np.bininvy) + np.nbiny - np.mbinylo;
|
||||
} else if(yin >= 0.0) {
|
||||
iy = (int)(yin * np.bininvy) - np.mbinylo;
|
||||
} else {
|
||||
iy = (int)(yin * np.bininvy) - np.mbinylo - 1;
|
||||
}
|
||||
|
||||
if(zin >= np.zprd) {
|
||||
iz = (int)((zin - np.zprd) * np.bininvz) + np.nbinz - np.mbinzlo;
|
||||
} else if(zin >= 0.0) {
|
||||
iz = (int)(zin * np.bininvz) - np.mbinzlo;
|
||||
} else {
|
||||
iz = (int)(zin * np.bininvz) - np.mbinzlo - 1;
|
||||
}
|
||||
|
||||
return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1);
|
||||
}
|
||||
|
||||
/* sorts the contents of a bin to make it comparable to the CPU version */
|
||||
/* uses bubble sort since atoms per bin should be relatively small and can be done in situ */
|
||||
__global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin){
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (i >= mbins){
|
||||
return;
|
||||
}
|
||||
|
||||
int atoms_in_bin = bincount[i];
|
||||
int* bin_ptr = &bins[i * atoms_per_bin];
|
||||
int sorted;
|
||||
do {
|
||||
sorted = 1;
|
||||
int tmp;
|
||||
for(int index = 0; index < atoms_in_bin - 1; index++){
|
||||
if (bin_ptr[index] > bin_ptr[index + 1]){
|
||||
tmp = bin_ptr[index];
|
||||
bin_ptr[index] = bin_ptr[index + 1];
|
||||
bin_ptr[index + 1] = tmp;
|
||||
sorted = 0;
|
||||
}
|
||||
}
|
||||
} while (!sorted);
|
||||
}
|
||||
|
||||
__global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed){
|
||||
Atom* atom = &a;
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
if(i >= nall){
|
||||
return;
|
||||
}
|
||||
|
||||
MD_FLOAT x = atom_x(i);
|
||||
MD_FLOAT y = atom_y(i);
|
||||
MD_FLOAT z = atom_z(i);
|
||||
int ibin = coord2bin_device(x, y, z, np);
|
||||
|
||||
int ac = atomicAdd(&bincount[ibin], 1);
|
||||
|
||||
if(ac < atoms_per_bin){
|
||||
bins[ibin * atoms_per_bin + ac] = i;
|
||||
} else {
|
||||
atomicMax(resize_needed, ac);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
|
||||
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq){
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int Nlocal = a.Nlocal;
|
||||
if( i >= Nlocal ) {
|
||||
return;
|
||||
}
|
||||
|
||||
Atom *atom = &a;
|
||||
Neighbor *neighbor = &neigh;
|
||||
|
||||
int* neighptr = &(neighbor->neighbors[i]);
|
||||
int n = 0;
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
int ibin = coord2bin_device(xtmp, ytmp, ztmp, np);
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_i = atom->type[i];
|
||||
#endif
|
||||
for(int k = 0; k < nstencil; k++) {
|
||||
int jbin = ibin + stencil[k];
|
||||
int* loc_bin = &bins[jbin * atoms_per_bin];
|
||||
|
||||
for(int m = 0; m < bincount[jbin]; m++) {
|
||||
int j = loc_bin[m];
|
||||
|
||||
if ( j == i ){
|
||||
continue;
|
||||
}
|
||||
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_j = atom->type[j];
|
||||
const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
|
||||
#else
|
||||
const MD_FLOAT cutoff = cutneighsq;
|
||||
#endif
|
||||
|
||||
if( rsq <= cutoff ) {
|
||||
int idx = atom->Nlocal * n;
|
||||
neighptr[idx] = j;
|
||||
n += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[i] = n;
|
||||
|
||||
if(n > neighbor->maxneighs) {
|
||||
atomicMax(new_maxneighs, n);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
|
||||
|
||||
static MD_FLOAT xprd, yprd, zprd;
|
||||
static MD_FLOAT bininvx, bininvy, bininvz;
|
||||
static int mbinxlo, mbinylo, mbinzlo;
|
||||
static int nbinx, nbiny, nbinz;
|
||||
static int mbinx, mbiny, mbinz; // n bins in x, y, z
|
||||
static int *bincount;
|
||||
static int *bins;
|
||||
static int mbins; //total number of bins
|
||||
static int atoms_per_bin; // max atoms per bin
|
||||
static MD_FLOAT cutneigh;
|
||||
static MD_FLOAT cutneighsq; // neighbor cutoff squared
|
||||
static int nmax;
|
||||
static int nstencil; // # of bins in stencil
|
||||
static int* stencil; // stencil list of bin offsets
|
||||
static MD_FLOAT binsizex, binsizey, binsizez;
|
||||
|
||||
static int* c_stencil = NULL;
|
||||
static int* c_resize_needed = NULL;
|
||||
static int* c_new_maxneighs = NULL;
|
||||
static Binning c_binning{
|
||||
.bincount = NULL,
|
||||
.bins = NULL,
|
||||
.mbins = 0,
|
||||
.atoms_per_bin = 0
|
||||
};
|
||||
|
||||
|
||||
static int coord2bin(MD_FLOAT, MD_FLOAT , MD_FLOAT);
|
||||
static MD_FLOAT bindist(int, int, int);
|
||||
|
||||
/* exported subroutines */
|
||||
void initNeighbor(Neighbor *neighbor, Parameter *param)
|
||||
{
|
||||
MD_FLOAT neighscale = 5.0 / 6.0;
|
||||
xprd = param->nx * param->lattice;
|
||||
yprd = param->ny * param->lattice;
|
||||
zprd = param->nz * param->lattice;
|
||||
cutneigh = param->cutneigh;
|
||||
nbinx = neighscale * param->nx;
|
||||
nbiny = neighscale * param->ny;
|
||||
nbinz = neighscale * param->nz;
|
||||
nmax = 0;
|
||||
atoms_per_bin = 8;
|
||||
stencil = NULL;
|
||||
bins = NULL;
|
||||
bincount = NULL;
|
||||
neighbor->maxneighs = 100;
|
||||
neighbor->numneigh = NULL;
|
||||
neighbor->neighbors = NULL;
|
||||
}
|
||||
|
||||
void setupNeighbor()
|
||||
{
|
||||
MD_FLOAT coord;
|
||||
int mbinxhi, mbinyhi, mbinzhi;
|
||||
int nextx, nexty, nextz;
|
||||
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = xprd;
|
||||
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
|
||||
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;
|
||||
|
||||
cutneighsq = cutneigh * cutneigh;
|
||||
binsizex = xprd / nbinx;
|
||||
binsizey = yprd / nbiny;
|
||||
binsizez = zprd / nbinz;
|
||||
bininvx = 1.0 / binsizex;
|
||||
bininvy = 1.0 / binsizey;
|
||||
bininvz = 1.0 / binsizez;
|
||||
|
||||
coord = xlo - cutneigh - SMALL * xprd;
|
||||
mbinxlo = (int) (coord * bininvx);
|
||||
if (coord < 0.0) {
|
||||
mbinxlo = mbinxlo - 1;
|
||||
}
|
||||
coord = xhi + cutneigh + SMALL * xprd;
|
||||
mbinxhi = (int) (coord * bininvx);
|
||||
|
||||
coord = ylo - cutneigh - SMALL * yprd;
|
||||
mbinylo = (int) (coord * bininvy);
|
||||
if (coord < 0.0) {
|
||||
mbinylo = mbinylo - 1;
|
||||
}
|
||||
coord = yhi + cutneigh + SMALL * yprd;
|
||||
mbinyhi = (int) (coord * bininvy);
|
||||
|
||||
coord = zlo - cutneigh - SMALL * zprd;
|
||||
mbinzlo = (int) (coord * bininvz);
|
||||
if (coord < 0.0) {
|
||||
mbinzlo = mbinzlo - 1;
|
||||
}
|
||||
coord = zhi + cutneigh + SMALL * zprd;
|
||||
mbinzhi = (int) (coord * bininvz);
|
||||
|
||||
mbinxlo = mbinxlo - 1;
|
||||
mbinxhi = mbinxhi + 1;
|
||||
mbinx = mbinxhi - mbinxlo + 1;
|
||||
|
||||
mbinylo = mbinylo - 1;
|
||||
mbinyhi = mbinyhi + 1;
|
||||
mbiny = mbinyhi - mbinylo + 1;
|
||||
|
||||
mbinzlo = mbinzlo - 1;
|
||||
mbinzhi = mbinzhi + 1;
|
||||
mbinz = mbinzhi - mbinzlo + 1;
|
||||
|
||||
nextx = (int) (cutneigh * bininvx);
|
||||
if(nextx * binsizex < FACTOR * cutneigh) nextx++;
|
||||
|
||||
nexty = (int) (cutneigh * bininvy);
|
||||
if(nexty * binsizey < FACTOR * cutneigh) nexty++;
|
||||
|
||||
nextz = (int) (cutneigh * bininvz);
|
||||
if(nextz * binsizez < FACTOR * cutneigh) nextz++;
|
||||
|
||||
if (stencil) {
|
||||
free(stencil);
|
||||
}
|
||||
|
||||
stencil = (int*) malloc(
|
||||
(2 * nextz + 1) * (2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
|
||||
|
||||
nstencil = 0;
|
||||
int kstart = -nextz;
|
||||
|
||||
for(int k = kstart; k <= nextz; k++) {
|
||||
for(int j = -nexty; j <= nexty; j++) {
|
||||
for(int i = -nextx; i <= nextx; i++) {
|
||||
if(bindist(i, j, k) < cutneighsq) {
|
||||
stencil[nstencil++] =
|
||||
k * mbiny * mbinx + j * mbinx + i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mbins = mbinx * mbiny * mbinz;
|
||||
|
||||
if (bincount) {
|
||||
free(bincount);
|
||||
}
|
||||
bincount = (int*) malloc(mbins * sizeof(int));
|
||||
|
||||
if (bins) {
|
||||
free(bins);
|
||||
}
|
||||
bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
|
||||
}
|
||||
|
||||
void buildNeighbor(Atom *atom, Neighbor *neighbor)
|
||||
{
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
|
||||
/* extend atom arrays if necessary */
|
||||
if(nall > nmax) {
|
||||
nmax = nall;
|
||||
if(neighbor->numneigh) cudaFreeHost(neighbor->numneigh);
|
||||
if(neighbor->neighbors) cudaFreeHost(neighbor->neighbors);
|
||||
checkCUDAError( "buildNeighbor numneigh", cudaMallocHost((void**)&(neighbor->numneigh), nmax * sizeof(int)) );
|
||||
checkCUDAError( "buildNeighbor neighbors", cudaMallocHost((void**)&(neighbor->neighbors), nmax * neighbor->maxneighs * sizeof(int)) );
|
||||
// neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
|
||||
// neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
|
||||
}
|
||||
|
||||
/* bin local & ghost atoms */
|
||||
binatoms(atom);
|
||||
int resize = 1;
|
||||
|
||||
/* loop over each atom, storing neighbors */
|
||||
while(resize) {
|
||||
int new_maxneighs = neighbor->maxneighs;
|
||||
resize = 0;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
int* neighptr = &(neighbor->neighbors[i]);
|
||||
int n = 0;
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
int ibin = coord2bin(xtmp, ytmp, ztmp);
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_i = atom->type[i];
|
||||
#endif
|
||||
for(int k = 0; k < nstencil; k++) {
|
||||
int jbin = ibin + stencil[k];
|
||||
int* loc_bin = &bins[jbin * atoms_per_bin];
|
||||
|
||||
for(int m = 0; m < bincount[jbin]; m++) {
|
||||
int j = loc_bin[m];
|
||||
|
||||
if ( j == i ){
|
||||
continue;
|
||||
}
|
||||
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_j = atom->type[j];
|
||||
const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
|
||||
#else
|
||||
const MD_FLOAT cutoff = cutneighsq;
|
||||
#endif
|
||||
|
||||
if( rsq <= cutoff ) {
|
||||
int idx = atom->Nlocal * n;
|
||||
neighptr[idx] = j;
|
||||
n += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[i] = n;
|
||||
|
||||
if(n >= neighbor->maxneighs) {
|
||||
resize = 1;
|
||||
|
||||
if(n >= new_maxneighs) {
|
||||
new_maxneighs = n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
printf("RESIZE %d\n", neighbor->maxneighs);
|
||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
||||
free(neighbor->neighbors);
|
||||
neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* internal subroutines */
|
||||
MD_FLOAT bindist(int i, int j, int k)
|
||||
{
|
||||
MD_FLOAT delx, dely, delz;
|
||||
|
||||
if(i > 0) {
|
||||
delx = (i - 1) * binsizex;
|
||||
} else if(i == 0) {
|
||||
delx = 0.0;
|
||||
} else {
|
||||
delx = (i + 1) * binsizex;
|
||||
}
|
||||
|
||||
if(j > 0) {
|
||||
dely = (j - 1) * binsizey;
|
||||
} else if(j == 0) {
|
||||
dely = 0.0;
|
||||
} else {
|
||||
dely = (j + 1) * binsizey;
|
||||
}
|
||||
|
||||
if(k > 0) {
|
||||
delz = (k - 1) * binsizez;
|
||||
} else if(k == 0) {
|
||||
delz = 0.0;
|
||||
} else {
|
||||
delz = (k + 1) * binsizez;
|
||||
}
|
||||
|
||||
return (delx * delx + dely * dely + delz * delz);
|
||||
}
|
||||
|
||||
int coord2bin(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin)
|
||||
{
|
||||
int ix, iy, iz;
|
||||
|
||||
if(xin >= xprd) {
|
||||
ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
|
||||
} else if(xin >= 0.0) {
|
||||
ix = (int)(xin * bininvx) - mbinxlo;
|
||||
} else {
|
||||
ix = (int)(xin * bininvx) - mbinxlo - 1;
|
||||
}
|
||||
|
||||
if(yin >= yprd) {
|
||||
iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
|
||||
} else if(yin >= 0.0) {
|
||||
iy = (int)(yin * bininvy) - mbinylo;
|
||||
} else {
|
||||
iy = (int)(yin * bininvy) - mbinylo - 1;
|
||||
}
|
||||
|
||||
if(zin >= zprd) {
|
||||
iz = (int)((zin - zprd) * bininvz) + nbinz - mbinzlo;
|
||||
} else if(zin >= 0.0) {
|
||||
iz = (int)(zin * bininvz) - mbinzlo;
|
||||
} else {
|
||||
iz = (int)(zin * bininvz) - mbinzlo - 1;
|
||||
}
|
||||
|
||||
return (iz * mbiny * mbinx + iy * mbinx + ix + 1);
|
||||
}
|
||||
|
||||
void binatoms(Atom *atom)
|
||||
{
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
int resize = 1;
|
||||
|
||||
while(resize > 0) {
|
||||
resize = 0;
|
||||
|
||||
for(int i = 0; i < mbins; i++) {
|
||||
bincount[i] = 0;
|
||||
}
|
||||
|
||||
for(int i = 0; i < nall; i++) {
|
||||
MD_FLOAT x = atom_x(i);
|
||||
MD_FLOAT y = atom_y(i);
|
||||
MD_FLOAT z = atom_z(i);
|
||||
int ibin = coord2bin(x, y, z);
|
||||
|
||||
if(bincount[ibin] < atoms_per_bin) {
|
||||
int ac = bincount[ibin]++;
|
||||
bins[ibin * atoms_per_bin + ac] = i;
|
||||
} else {
|
||||
resize = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
free(bins);
|
||||
atoms_per_bin *= 2;
|
||||
bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void sortAtom(Atom* atom) {
|
||||
binatoms(atom);
|
||||
int Nmax = atom->Nmax;
|
||||
int* binpos = bincount;
|
||||
|
||||
for(int i=1; i<mbins; i++) {
|
||||
binpos[i] += binpos[i-1];
|
||||
}
|
||||
|
||||
#ifdef AOS
|
||||
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
|
||||
|
||||
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
|
||||
#else
|
||||
MD_FLOAT* new_x = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
MD_FLOAT* new_y = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
MD_FLOAT* new_z = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
|
||||
MD_FLOAT* new_vx = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
MD_FLOAT* new_vy = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
MD_FLOAT* new_vz = (MD_FLOAT*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
#endif
|
||||
|
||||
MD_FLOAT* old_x = atom->x; MD_FLOAT* old_y = atom->y; MD_FLOAT* old_z = atom->z;
|
||||
MD_FLOAT* old_vx = atom->vx; MD_FLOAT* old_vy = atom->vy; MD_FLOAT* old_vz = atom->vz;
|
||||
|
||||
for(int mybin = 0; mybin<mbins; mybin++) {
|
||||
int start = mybin>0?binpos[mybin-1]:0;
|
||||
int count = binpos[mybin] - start;
|
||||
for(int k=0; k<count; k++) {
|
||||
int new_i = start + k;
|
||||
int old_i = bins[mybin * atoms_per_bin + k];
|
||||
#ifdef AOS
|
||||
new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
|
||||
new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
|
||||
new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
|
||||
|
||||
new_vx[new_i * 3 + 0] = old_vx[old_i * 3 + 0];
|
||||
new_vx[new_i * 3 + 1] = old_vy[old_i * 3 + 1];
|
||||
new_vx[new_i * 3 + 2] = old_vz[old_i * 3 + 2];
|
||||
#else
|
||||
new_x[new_i] = old_x[old_i];
|
||||
new_y[new_i] = old_y[old_i];
|
||||
new_z[new_i] = old_z[old_i];
|
||||
|
||||
new_vx[new_i] = old_vx[old_i];
|
||||
new_vy[new_i] = old_vy[old_i];
|
||||
new_vz[new_i] = old_vz[old_i];
|
||||
#endif
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
free(atom->x);
|
||||
atom->x = new_x;
|
||||
|
||||
free(atom->vx);
|
||||
atom->vx = new_vx;
|
||||
#ifndef AOS
|
||||
free(atom->y);
|
||||
free(atom->z);
|
||||
atom->y = new_y; atom->z = new_z;
|
||||
|
||||
free(atom->vy); free(atom->vz);
|
||||
atom->vy = new_vy; atom->vz = new_vz;
|
||||
#endif
|
||||
}
|
||||
|
||||
void binatoms_cuda(Atom* c_atom, Binning* c_binning, int* c_resize_needed, Neighbor_params *np, const int threads_per_block)
|
||||
{
|
||||
int nall = c_atom->Nlocal + c_atom->Nghost;
|
||||
int resize = 1;
|
||||
|
||||
const int num_blocks = ceil((float)nall / (float)threads_per_block);
|
||||
|
||||
while(resize > 0) {
|
||||
resize = 0;
|
||||
checkCUDAError("binatoms_cuda c_binning->bincount memset", cudaMemset(c_binning->bincount, 0, c_binning->mbins * sizeof(int)));
|
||||
checkCUDAError("binatoms_cuda c_resize_needed memset", cudaMemset(c_resize_needed, 0, sizeof(int)) );
|
||||
|
||||
/*binatoms_kernel(Atom a, int* bincount, int* bins, int c_binning->atoms_per_bin, Neighbor_params np, int *resize_needed) */
|
||||
binatoms_kernel<<<num_blocks, threads_per_block>>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed);
|
||||
|
||||
checkCUDAError( "PeekAtLastError binatoms kernel", cudaPeekAtLastError() );
|
||||
checkCUDAError( "DeviceSync binatoms kernel", cudaDeviceSynchronize() );
|
||||
|
||||
checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) );
|
||||
|
||||
if(resize) {
|
||||
cudaFree(c_binning->bins);
|
||||
c_binning->atoms_per_bin *= 2;
|
||||
checkCUDAError("binatoms_cuda c_binning->bins resize malloc", cudaMalloc(&c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) );
|
||||
}
|
||||
}
|
||||
atoms_per_bin = c_binning->atoms_per_bin;
|
||||
const int sortBlocks = ceil((float)mbins / (float)threads_per_block);
|
||||
/*void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin)*/
|
||||
sort_bin_contents_kernel<<<sortBlocks, threads_per_block>>>(c_binning->bincount, c_binning->bins, c_binning->mbins, c_binning->atoms_per_bin);
|
||||
checkCUDAError( "PeekAtLastError sort_bin_contents kernel", cudaPeekAtLastError() );
|
||||
checkCUDAError( "DeviceSync sort_bin_contents kernel", cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block, double* timers)
|
||||
{
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
c_neighbor->maxneighs = neighbor->maxneighs;
|
||||
|
||||
cudaProfilerStart();
|
||||
/* upload stencil */
|
||||
// TODO move all of this initialization into its own method
|
||||
if(c_stencil == NULL){
|
||||
checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) );
|
||||
checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice ));
|
||||
}
|
||||
|
||||
if(c_binning.mbins == 0){
|
||||
c_binning.mbins = mbins;
|
||||
c_binning.atoms_per_bin = atoms_per_bin;
|
||||
checkCUDAError( "buildNeighbor c_binning->bincount malloc", cudaMalloc((void**)&(c_binning.bincount), c_binning.mbins * sizeof(int)) );
|
||||
checkCUDAError( "buidlNeighbor c_binning->bins malloc", cudaMalloc((void**)&(c_binning.bins), c_binning.mbins * c_binning.atoms_per_bin * sizeof(int)) );
|
||||
}
|
||||
|
||||
Neighbor_params np{
|
||||
.xprd = xprd,
|
||||
.yprd = yprd,
|
||||
.zprd = zprd,
|
||||
.bininvx = bininvx,
|
||||
.bininvy = bininvy,
|
||||
.bininvz = bininvz,
|
||||
.mbinxlo = mbinxlo,
|
||||
.mbinylo = mbinylo,
|
||||
.mbinzlo = mbinzlo,
|
||||
.nbinx = nbinx,
|
||||
.nbiny = nbiny,
|
||||
.nbinz = nbinz,
|
||||
.mbinx = mbinx,
|
||||
.mbiny = mbiny,
|
||||
.mbinz = mbinz
|
||||
};
|
||||
|
||||
if(c_resize_needed == NULL){
|
||||
checkCUDAError("buildNeighbor c_resize_needed malloc", cudaMalloc((void**)&c_resize_needed, sizeof(int)) );
|
||||
}
|
||||
|
||||
/* bin local & ghost atoms */
|
||||
double beforeBinning = getTimeStamp();
|
||||
binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block);
|
||||
double afterBinning = getTimeStamp();
|
||||
timers[NEIGH_BINATOMS] += afterBinning - beforeBinning;
|
||||
|
||||
if(c_new_maxneighs == NULL){
|
||||
checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) ));
|
||||
}
|
||||
|
||||
int resize = 1;
|
||||
|
||||
/* extend c_neighbor arrays if necessary */
|
||||
if(nall > nmax) {
|
||||
nmax = nall;
|
||||
if(c_neighbor->numneigh) cudaFree(c_neighbor->numneigh);
|
||||
if(c_neighbor->neighbors) cudaFree(c_neighbor->neighbors);
|
||||
checkCUDAError( "buildNeighbor c_numneigh malloc", cudaMalloc((void**)&(c_neighbor->numneigh), nmax * sizeof(int)) );
|
||||
checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) );
|
||||
}
|
||||
|
||||
/* loop over each atom, storing neighbors */
|
||||
while(resize) {
|
||||
resize = 0;
|
||||
|
||||
checkCUDAError("c_new_maxneighs memset", cudaMemset(c_new_maxneighs, 0, sizeof(int) ));
|
||||
|
||||
// TODO call compute_neigborhood kernel here
|
||||
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
|
||||
/*compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil,
|
||||
int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs)
|
||||
* */
|
||||
compute_neighborhood<<<num_blocks, num_threads_per_block>>>(*c_atom, *c_neighbor,
|
||||
np, nstencil, c_stencil,
|
||||
c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount,
|
||||
c_new_maxneighs,
|
||||
cutneighsq);
|
||||
|
||||
checkCUDAError( "PeekAtLastError ComputeNeighbor", cudaPeekAtLastError() );
|
||||
checkCUDAError( "DeviceSync ComputeNeighbor", cudaDeviceSynchronize() );
|
||||
|
||||
// TODO copy the value of c_new_maxneighs back to host and check if it has been modified
|
||||
int new_maxneighs;
|
||||
checkCUDAError("c_new_maxneighs memcpy back", cudaMemcpy(&new_maxneighs, c_new_maxneighs, sizeof(int), cudaMemcpyDeviceToHost));
|
||||
if (new_maxneighs > c_neighbor->maxneighs){
|
||||
resize = 1;
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
printf("RESIZE %d\n", c_neighbor->maxneighs);
|
||||
c_neighbor->maxneighs = new_maxneighs * 1.2;
|
||||
printf("NEW SIZE %d\n", c_neighbor->maxneighs);
|
||||
cudaFree(c_neighbor->neighbors);
|
||||
checkCUDAError("c_neighbor->neighbors resize malloc",
|
||||
cudaMalloc((void**)(&c_neighbor->neighbors),
|
||||
c_atom->Nmax * c_neighbor->maxneighs * sizeof(int)));
|
||||
}
|
||||
|
||||
}
|
||||
neighbor->maxneighs = c_neighbor->maxneighs;
|
||||
|
||||
cudaProfilerStop();
|
||||
}
|
||||
}
|
286
lammps/pbc.cu
Normal file
286
lammps/pbc.cu
Normal file
@ -0,0 +1,286 @@
|
||||
/*
|
||||
* =======================================================================================
|
||||
*
|
||||
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
|
||||
*
|
||||
* This file is part of MD-Bench.
|
||||
*
|
||||
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||
* under the terms of the GNU Lesser General Public License as published
|
||||
* by the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License along
|
||||
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||
* =======================================================================================
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
extern "C" {
|
||||
|
||||
#include <pbc.h>
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
|
||||
#define DELTA 20000
|
||||
|
||||
}
|
||||
|
||||
__global__ void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
Atom* atom = &a;
|
||||
if( i >= atom->Nlocal ){
|
||||
return;
|
||||
}
|
||||
|
||||
if (atom_x(i) < 0.0) {
|
||||
atom_x(i) += xprd;
|
||||
} else if (atom_x(i) >= xprd) {
|
||||
atom_x(i) -= xprd;
|
||||
}
|
||||
|
||||
if (atom_y(i) < 0.0) {
|
||||
atom_y(i) += yprd;
|
||||
} else if (atom_y(i) >= yprd) {
|
||||
atom_y(i) -= yprd;
|
||||
}
|
||||
|
||||
if (atom_z(i) < 0.0) {
|
||||
atom_z(i) += zprd;
|
||||
} else if (atom_z(i) >= zprd) {
|
||||
atom_z(i) -= zprd;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd){
|
||||
const int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const int Nghost = a.Nghost;
|
||||
if( i >= Nghost ) {
|
||||
return;
|
||||
}
|
||||
Atom* atom = &a;
|
||||
int *border_map = atom->border_map;
|
||||
int nlocal = atom->Nlocal;
|
||||
|
||||
atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
|
||||
atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
|
||||
atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
|
||||
}
|
||||
|
||||
extern "C"{
|
||||
|
||||
static int NmaxGhost;
|
||||
static int *PBCx, *PBCy, *PBCz;
|
||||
|
||||
static int c_NmaxGhost = 0;
|
||||
static int *c_PBCx = NULL, *c_PBCy = NULL, *c_PBCz = NULL;
|
||||
|
||||
static void growPbc(Atom *);
|
||||
|
||||
/* exported subroutines */
|
||||
void initPbc(Atom *atom) {
|
||||
NmaxGhost = 0;
|
||||
atom->border_map = NULL;
|
||||
PBCx = NULL;
|
||||
PBCy = NULL;
|
||||
PBCz = NULL;
|
||||
}
|
||||
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
void updatePbc(Atom *atom, Parameter *param) {
|
||||
int *border_map = atom->border_map;
|
||||
int nlocal = atom->Nlocal;
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
for (int i = 0; i < atom->Nghost; i++) {
|
||||
atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
|
||||
atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
|
||||
atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
|
||||
}
|
||||
}
|
||||
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
void updatePbc_cuda(Atom *atom, Parameter *param, Atom *c_atom, bool doReneighbor, const int num_threads_per_block) {
|
||||
if (doReneighbor){
|
||||
c_atom->Natoms = atom->Natoms;
|
||||
c_atom->Nlocal = atom->Nlocal;
|
||||
c_atom->Nghost = atom->Nghost;
|
||||
c_atom->ntypes = atom->ntypes;
|
||||
|
||||
if (atom->Nmax > c_atom->Nmax){ // the number of ghost atoms has increased -> more space is needed
|
||||
c_atom->Nmax = atom->Nmax;
|
||||
if(c_atom->x != NULL){ cudaFree(c_atom->x); }
|
||||
if(c_atom->type != NULL){ cudaFree(c_atom->type); }
|
||||
checkCUDAError( "updatePbc c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
|
||||
checkCUDAError( "updatePbc c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
|
||||
}
|
||||
// TODO if the sort is reactivated the atom->vx needs to be copied to GPU as well
|
||||
checkCUDAError( "updatePbc c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
|
||||
checkCUDAError( "updatePbc c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
|
||||
|
||||
if(c_NmaxGhost < NmaxGhost){
|
||||
c_NmaxGhost = NmaxGhost;
|
||||
if(c_PBCx != NULL){ cudaFree(c_PBCx); }
|
||||
if(c_PBCy != NULL){ cudaFree(c_PBCy); }
|
||||
if(c_PBCz != NULL){ cudaFree(c_PBCz); }
|
||||
if(c_atom->border_map != NULL){ cudaFree(c_atom->border_map); }
|
||||
checkCUDAError( "updatePbc c_PBCx malloc", cudaMalloc((void**)&c_PBCx, NmaxGhost * sizeof(int)) );
|
||||
checkCUDAError( "updatePbc c_PBCy malloc", cudaMalloc((void**)&c_PBCy, NmaxGhost * sizeof(int)) );
|
||||
checkCUDAError( "updatePbc c_PBCz malloc", cudaMalloc((void**)&c_PBCz, NmaxGhost * sizeof(int)) );
|
||||
checkCUDAError( "updatePbc c_atom->border_map malloc", cudaMalloc((void**)&(c_atom->border_map), NmaxGhost * sizeof(int)) );
|
||||
}
|
||||
checkCUDAError( "updatePbc c_PBCx memcpy", cudaMemcpy(c_PBCx, PBCx, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
|
||||
checkCUDAError( "updatePbc c_PBCy memcpy", cudaMemcpy(c_PBCy, PBCy, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
|
||||
checkCUDAError( "updatePbc c_PBCz memcpy", cudaMemcpy(c_PBCz, PBCz, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
|
||||
checkCUDAError( "updatePbc c_atom->border_map memcpy", cudaMemcpy(c_atom->border_map, atom->border_map, NmaxGhost * sizeof(int), cudaMemcpyHostToDevice) );
|
||||
}
|
||||
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
const int num_blocks = ceil((float)atom->Nghost / (float)num_threads_per_block);
|
||||
|
||||
/*__global__ void computePbcUpdate(Atom a, int* PBCx, int* PBCy, int* PBCz,
|
||||
* MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)
|
||||
* */
|
||||
computePbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, c_PBCx, c_PBCy, c_PBCz, xprd, yprd, zprd);
|
||||
checkCUDAError( "PeekAtLastError UpdatePbc", cudaPeekAtLastError() );
|
||||
checkCUDAError( "DeviceSync UpdatePbc", cudaDeviceSynchronize() );
|
||||
}
|
||||
|
||||
/* relocate atoms that have left domain according
|
||||
* to periodic boundary conditions */
|
||||
void updateAtomsPbc(Atom *atom, Parameter *param) {
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
for (int i = 0; i < atom->Nlocal; i++) {
|
||||
|
||||
if (atom_x(i) < 0.0) {
|
||||
atom_x(i) += xprd;
|
||||
} else if (atom_x(i) >= xprd) {
|
||||
atom_x(i) -= xprd;
|
||||
}
|
||||
|
||||
if (atom_y(i) < 0.0) {
|
||||
atom_y(i) += yprd;
|
||||
} else if (atom_y(i) >= yprd) {
|
||||
atom_y(i) -= yprd;
|
||||
}
|
||||
|
||||
if (atom_z(i) < 0.0) {
|
||||
atom_z(i) += zprd;
|
||||
} else if (atom_z(i) >= zprd) {
|
||||
atom_z(i) -= zprd;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void updateAtomsPbc_cuda(Atom* atom, Parameter* param, Atom* c_atom, const int num_threads_per_block){
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block);
|
||||
/*void computeAtomsPbcUpdate(Atom a, MD_FLOAT xprd, MD_FLOAT yprd, MD_FLOAT zprd)*/
|
||||
computeAtomsPbcUpdate<<<num_blocks, num_threads_per_block>>>(*c_atom, xprd, yprd, zprd);
|
||||
|
||||
checkCUDAError( "PeekAtLastError UpdateAtomsPbc", cudaPeekAtLastError() );
|
||||
checkCUDAError( "DeviceSync UpdateAtomsPbc", cudaDeviceSynchronize() );
|
||||
|
||||
checkCUDAError( "updateAtomsPbc position memcpy back", cudaMemcpy(atom->x, c_atom->x, sizeof(MD_FLOAT) * atom->Nlocal * 3, cudaMemcpyDeviceToHost) );
|
||||
}
|
||||
|
||||
/* setup periodic boundary conditions by
|
||||
* defining ghost atoms around domain
|
||||
* only creates mapping and coordinate corrections
|
||||
* that are then enforced in updatePbc */
|
||||
#define ADDGHOST(dx, dy, dz) \
|
||||
Nghost++; \
|
||||
border_map[Nghost] = i; \
|
||||
PBCx[Nghost] = dx; \
|
||||
PBCy[Nghost] = dy; \
|
||||
PBCz[Nghost] = dz; \
|
||||
atom->type[atom->Nlocal + Nghost] = atom->type[i]
|
||||
|
||||
void setupPbc(Atom *atom, Parameter *param) {
|
||||
int *border_map = atom->border_map;
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
MD_FLOAT Cutneigh = param->cutneigh;
|
||||
int Nghost = -1;
|
||||
|
||||
for (int i = 0; i < atom->Nlocal; i++) {
|
||||
|
||||
if (atom->Nlocal + Nghost + 7 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
if (Nghost + 7 >= NmaxGhost) {
|
||||
growPbc(atom);
|
||||
border_map = atom->border_map;
|
||||
}
|
||||
|
||||
MD_FLOAT x = atom_x(i);
|
||||
MD_FLOAT y = atom_y(i);
|
||||
MD_FLOAT z = atom_z(i);
|
||||
|
||||
/* Setup ghost atoms */
|
||||
/* 6 planes */
|
||||
if (x < Cutneigh) { ADDGHOST(+1, 0, 0); }
|
||||
if (x >= (xprd - Cutneigh)) { ADDGHOST(-1, 0, 0); }
|
||||
if (y < Cutneigh) { ADDGHOST(0, +1, 0); }
|
||||
if (y >= (yprd - Cutneigh)) { ADDGHOST(0, -1, 0); }
|
||||
if (z < Cutneigh) { ADDGHOST(0, 0, +1); }
|
||||
if (z >= (zprd - Cutneigh)) { ADDGHOST(0, 0, -1); }
|
||||
/* 8 corners */
|
||||
if (x < Cutneigh && y < Cutneigh && z < Cutneigh) { ADDGHOST(+1, +1, +1); }
|
||||
if (x < Cutneigh && y >= (yprd - Cutneigh) && z < Cutneigh) { ADDGHOST(+1, -1, +1); }
|
||||
if (x < Cutneigh && y >= Cutneigh && z >= (zprd - Cutneigh)) { ADDGHOST(+1, +1, -1); }
|
||||
if (x < Cutneigh && y >= (yprd - Cutneigh) && z >= (zprd - Cutneigh)) { ADDGHOST(+1, -1, -1); }
|
||||
if (x >= (xprd - Cutneigh) && y < Cutneigh && z < Cutneigh) { ADDGHOST(-1, +1, +1); }
|
||||
if (x >= (xprd - Cutneigh) && y >= (yprd - Cutneigh) && z < Cutneigh) { ADDGHOST(-1, -1, +1); }
|
||||
if (x >= (xprd - Cutneigh) && y < Cutneigh && z >= (zprd - Cutneigh)) { ADDGHOST(-1, +1, -1); }
|
||||
if (x >= (xprd - Cutneigh) && y >= (yprd - Cutneigh) && z >= (zprd - Cutneigh)) { ADDGHOST(-1, -1, -1); }
|
||||
/* 12 edges */
|
||||
if (x < Cutneigh && z < Cutneigh) { ADDGHOST(+1, 0, +1); }
|
||||
if (x < Cutneigh && z >= (zprd - Cutneigh)) { ADDGHOST(+1, 0, -1); }
|
||||
if (x >= (xprd - Cutneigh) && z < Cutneigh) { ADDGHOST(-1, 0, +1); }
|
||||
if (x >= (xprd - Cutneigh) && z >= (zprd - Cutneigh)) { ADDGHOST(-1, 0, -1); }
|
||||
if (y < Cutneigh && z < Cutneigh) { ADDGHOST(0, +1, +1); }
|
||||
if (y < Cutneigh && z >= (zprd - Cutneigh)) { ADDGHOST(0, +1, -1); }
|
||||
if (y >= (yprd - Cutneigh) && z < Cutneigh) { ADDGHOST(0, -1, +1); }
|
||||
if (y >= (yprd - Cutneigh) && z >= (zprd - Cutneigh)) { ADDGHOST(0, -1, -1); }
|
||||
if (y < Cutneigh && x < Cutneigh) { ADDGHOST(+1, +1, 0); }
|
||||
if (y < Cutneigh && x >= (xprd - Cutneigh)) { ADDGHOST(-1, +1, 0); }
|
||||
if (y >= (yprd - Cutneigh) && x < Cutneigh) { ADDGHOST(+1, -1, 0); }
|
||||
if (y >= (yprd - Cutneigh) && x >= (xprd - Cutneigh)) { ADDGHOST(-1, -1, 0); }
|
||||
}
|
||||
// increase by one to make it the ghost atom count
|
||||
atom->Nghost = Nghost + 1;
|
||||
}
|
||||
|
||||
/* internal subroutines */
|
||||
void growPbc(Atom *atom) {
|
||||
int nold = NmaxGhost;
|
||||
NmaxGhost += DELTA;
|
||||
|
||||
atom->border_map = (int *) reallocate(atom->border_map, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
PBCx = (int *) reallocate(PBCx, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
PBCy = (int *) reallocate(PBCy, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
PBCz = (int *) reallocate(PBCz, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
}
|
||||
}
|
@ -71,12 +71,9 @@ void setupThermo(Parameter *param, int natoms)
|
||||
void computeThermo(int iflag, Parameter *param, Atom *atom)
|
||||
{
|
||||
MD_FLOAT t = 0.0, p;
|
||||
MD_FLOAT* vx = atom->vx;
|
||||
MD_FLOAT* vy = atom->vy;
|
||||
MD_FLOAT* vz = atom->vz;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
|
||||
t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
|
||||
}
|
||||
|
||||
t = t * t_scale;
|
||||
@ -101,12 +98,11 @@ void adjustThermo(Parameter *param, Atom *atom)
|
||||
{
|
||||
/* zero center-of-mass motion */
|
||||
MD_FLOAT vxtot = 0.0; MD_FLOAT vytot = 0.0; MD_FLOAT vztot = 0.0;
|
||||
MD_FLOAT* vx = atom->vx; MD_FLOAT* vy = atom->vy; MD_FLOAT* vz = atom->vz;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
vxtot += vx[i];
|
||||
vytot += vy[i];
|
||||
vztot += vz[i];
|
||||
vxtot += atom_vx(i);
|
||||
vytot += atom_vy(i);
|
||||
vztot += atom_vz(i);
|
||||
}
|
||||
|
||||
vxtot = vxtot / atom->Natoms;
|
||||
@ -114,24 +110,24 @@ void adjustThermo(Parameter *param, Atom *atom)
|
||||
vztot = vztot / atom->Natoms;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
vx[i] -= vxtot;
|
||||
vy[i] -= vytot;
|
||||
vz[i] -= vztot;
|
||||
atom_vx(i) -= vxtot;
|
||||
atom_vy(i) -= vytot;
|
||||
atom_vz(i) -= vztot;
|
||||
}
|
||||
|
||||
t_act = 0;
|
||||
MD_FLOAT t = 0.0;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
t += (vx[i] * vx[i] + vy[i] * vy[i] + vz[i] * vz[i]) * param->mass;
|
||||
t += (atom_vx(i) * atom_vx(i) + atom_vy(i) * atom_vy(i) + atom_vz(i) * atom_vz(i)) * param->mass;
|
||||
}
|
||||
|
||||
t *= t_scale;
|
||||
MD_FLOAT factor = sqrt(param->temp / t);
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
vx[i] *= factor;
|
||||
vy[i] *= factor;
|
||||
vz[i] *= factor;
|
||||
atom_vx(i) *= factor;
|
||||
atom_vy(i) *= factor;
|
||||
atom_vz(i) *= factor;
|
||||
}
|
||||
}
|
106
src/force.c
106
src/force.c
@ -1,106 +0,0 @@
|
||||
/*
|
||||
* =======================================================================================
|
||||
*
|
||||
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
|
||||
*
|
||||
* This file is part of MD-Bench.
|
||||
*
|
||||
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||
* under the terms of the GNU Lesser General Public License as published
|
||||
* by the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License along
|
||||
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||
* =======================================================================================
|
||||
*/
|
||||
#include <likwid-marker.h>
|
||||
|
||||
#include <timing.h>
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
|
||||
double computeForce(
|
||||
Parameter *param,
|
||||
Atom *atom,
|
||||
Neighbor *neighbor
|
||||
)
|
||||
{
|
||||
int Nlocal = atom->Nlocal;
|
||||
int* neighs;
|
||||
MD_FLOAT* fx = atom->fx;
|
||||
MD_FLOAT* fy = atom->fy;
|
||||
MD_FLOAT* fz = atom->fz;
|
||||
#ifndef EXPLICIT_TYPES
|
||||
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
|
||||
MD_FLOAT sigma6 = param->sigma6;
|
||||
MD_FLOAT epsilon = param->epsilon;
|
||||
#endif
|
||||
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
fx[i] = 0.0;
|
||||
fy[i] = 0.0;
|
||||
fz[i] = 0.0;
|
||||
}
|
||||
|
||||
double S = getTimeStamp();
|
||||
LIKWID_MARKER_START("force");
|
||||
|
||||
#pragma omp parallel for
|
||||
for(int i = 0; i < Nlocal; i++) {
|
||||
neighs = &neighbor->neighbors[i * neighbor->maxneighs];
|
||||
int numneighs = neighbor->numneigh[i];
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
MD_FLOAT fix = 0;
|
||||
MD_FLOAT fiy = 0;
|
||||
MD_FLOAT fiz = 0;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_i = atom->type[i];
|
||||
#endif
|
||||
|
||||
|
||||
for(int k = 0; k < numneighs; k++) {
|
||||
int j = neighs[k];
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
const int type_j = atom->type[j];
|
||||
const int type_ij = type_i * atom->ntypes + type_j;
|
||||
const MD_FLOAT cutforcesq = atom->cutforcesq[type_ij];
|
||||
const MD_FLOAT sigma6 = atom->sigma6[type_ij];
|
||||
const MD_FLOAT epsilon = atom->epsilon[type_ij];
|
||||
#endif
|
||||
|
||||
if(rsq < cutforcesq) {
|
||||
MD_FLOAT sr2 = 1.0 / rsq;
|
||||
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
|
||||
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
|
||||
fix += delx * force;
|
||||
fiy += dely * force;
|
||||
fiz += delz * force;
|
||||
}
|
||||
}
|
||||
|
||||
fx[i] += fix;
|
||||
fy[i] += fiy;
|
||||
fz[i] += fiz;
|
||||
}
|
||||
|
||||
LIKWID_MARKER_STOP("force");
|
||||
double E = getTimeStamp();
|
||||
|
||||
return E-S;
|
||||
}
|
396
src/neighbor.c
396
src/neighbor.c
@ -1,396 +0,0 @@
|
||||
/*
|
||||
* =======================================================================================
|
||||
*
|
||||
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||
* Copyright (c) 2021 RRZE, University Erlangen-Nuremberg
|
||||
*
|
||||
* This file is part of MD-Bench.
|
||||
*
|
||||
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||
* under the terms of the GNU Lesser General Public License as published
|
||||
* by the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License along
|
||||
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||
* =======================================================================================
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#include <neighbor.h>
|
||||
#include <parameter.h>
|
||||
#include <atom.h>
|
||||
|
||||
#define SMALL 1.0e-6
|
||||
#define FACTOR 0.999
|
||||
|
||||
static MD_FLOAT xprd, yprd, zprd;
|
||||
static MD_FLOAT bininvx, bininvy, bininvz;
|
||||
static int mbinxlo, mbinylo, mbinzlo;
|
||||
static int nbinx, nbiny, nbinz;
|
||||
static int mbinx, mbiny, mbinz; // n bins in x, y, z
|
||||
static int *bincount;
|
||||
static int *bins;
|
||||
static int mbins; //total number of bins
|
||||
static int atoms_per_bin; // max atoms per bin
|
||||
static MD_FLOAT cutneigh;
|
||||
static MD_FLOAT cutneighsq; // neighbor cutoff squared
|
||||
static int nmax;
|
||||
static int nstencil; // # of bins in stencil
|
||||
static int* stencil; // stencil list of bin offsets
|
||||
static MD_FLOAT binsizex, binsizey, binsizez;
|
||||
|
||||
static int coord2bin(MD_FLOAT, MD_FLOAT , MD_FLOAT);
|
||||
static MD_FLOAT bindist(int, int, int);
|
||||
|
||||
/* exported subroutines */
|
||||
void initNeighbor(Neighbor *neighbor, Parameter *param)
|
||||
{
|
||||
MD_FLOAT neighscale = 5.0 / 6.0;
|
||||
xprd = param->nx * param->lattice;
|
||||
yprd = param->ny * param->lattice;
|
||||
zprd = param->nz * param->lattice;
|
||||
cutneigh = param->cutneigh;
|
||||
nbinx = neighscale * param->nx;
|
||||
nbiny = neighscale * param->ny;
|
||||
nbinz = neighscale * param->nz;
|
||||
nmax = 0;
|
||||
atoms_per_bin = 8;
|
||||
stencil = NULL;
|
||||
bins = NULL;
|
||||
bincount = NULL;
|
||||
neighbor->maxneighs = 100;
|
||||
neighbor->numneigh = NULL;
|
||||
neighbor->neighbors = NULL;
|
||||
}
|
||||
|
||||
void setupNeighbor()
|
||||
{
|
||||
MD_FLOAT coord;
|
||||
int mbinxhi, mbinyhi, mbinzhi;
|
||||
int nextx, nexty, nextz;
|
||||
MD_FLOAT xlo = 0.0; MD_FLOAT xhi = xprd;
|
||||
MD_FLOAT ylo = 0.0; MD_FLOAT yhi = yprd;
|
||||
MD_FLOAT zlo = 0.0; MD_FLOAT zhi = zprd;
|
||||
|
||||
cutneighsq = cutneigh * cutneigh;
|
||||
binsizex = xprd / nbinx;
|
||||
binsizey = yprd / nbiny;
|
||||
binsizez = zprd / nbinz;
|
||||
bininvx = 1.0 / binsizex;
|
||||
bininvy = 1.0 / binsizey;
|
||||
bininvz = 1.0 / binsizez;
|
||||
|
||||
coord = xlo - cutneigh - SMALL * xprd;
|
||||
mbinxlo = (int) (coord * bininvx);
|
||||
if (coord < 0.0) {
|
||||
mbinxlo = mbinxlo - 1;
|
||||
}
|
||||
coord = xhi + cutneigh + SMALL * xprd;
|
||||
mbinxhi = (int) (coord * bininvx);
|
||||
|
||||
coord = ylo - cutneigh - SMALL * yprd;
|
||||
mbinylo = (int) (coord * bininvy);
|
||||
if (coord < 0.0) {
|
||||
mbinylo = mbinylo - 1;
|
||||
}
|
||||
coord = yhi + cutneigh + SMALL * yprd;
|
||||
mbinyhi = (int) (coord * bininvy);
|
||||
|
||||
coord = zlo - cutneigh - SMALL * zprd;
|
||||
mbinzlo = (int) (coord * bininvz);
|
||||
if (coord < 0.0) {
|
||||
mbinzlo = mbinzlo - 1;
|
||||
}
|
||||
coord = zhi + cutneigh + SMALL * zprd;
|
||||
mbinzhi = (int) (coord * bininvz);
|
||||
|
||||
mbinxlo = mbinxlo - 1;
|
||||
mbinxhi = mbinxhi + 1;
|
||||
mbinx = mbinxhi - mbinxlo + 1;
|
||||
|
||||
mbinylo = mbinylo - 1;
|
||||
mbinyhi = mbinyhi + 1;
|
||||
mbiny = mbinyhi - mbinylo + 1;
|
||||
|
||||
mbinzlo = mbinzlo - 1;
|
||||
mbinzhi = mbinzhi + 1;
|
||||
mbinz = mbinzhi - mbinzlo + 1;
|
||||
|
||||
nextx = (int) (cutneigh * bininvx);
|
||||
if(nextx * binsizex < FACTOR * cutneigh) nextx++;
|
||||
|
||||
nexty = (int) (cutneigh * bininvy);
|
||||
if(nexty * binsizey < FACTOR * cutneigh) nexty++;
|
||||
|
||||
nextz = (int) (cutneigh * bininvz);
|
||||
if(nextz * binsizez < FACTOR * cutneigh) nextz++;
|
||||
|
||||
if (stencil) {
|
||||
free(stencil);
|
||||
}
|
||||
|
||||
stencil = (int*) malloc(
|
||||
(2 * nextz + 1) * (2 * nexty + 1) * (2 * nextx + 1) * sizeof(int));
|
||||
|
||||
nstencil = 0;
|
||||
int kstart = -nextz;
|
||||
|
||||
for(int k = kstart; k <= nextz; k++) {
|
||||
for(int j = -nexty; j <= nexty; j++) {
|
||||
for(int i = -nextx; i <= nextx; i++) {
|
||||
if(bindist(i, j, k) < cutneighsq) {
|
||||
stencil[nstencil++] =
|
||||
k * mbiny * mbinx + j * mbinx + i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
mbins = mbinx * mbiny * mbinz;
|
||||
|
||||
if (bincount) {
|
||||
free(bincount);
|
||||
}
|
||||
bincount = (int*) malloc(mbins * sizeof(int));
|
||||
|
||||
if (bins) {
|
||||
free(bins);
|
||||
}
|
||||
bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
|
||||
}
|
||||
|
||||
void buildNeighbor(Atom *atom, Neighbor *neighbor)
|
||||
{
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
|
||||
/* extend atom arrays if necessary */
|
||||
if(nall > nmax) {
|
||||
nmax = nall;
|
||||
if(neighbor->numneigh) free(neighbor->numneigh);
|
||||
if(neighbor->neighbors) free(neighbor->neighbors);
|
||||
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
|
||||
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*));
|
||||
}
|
||||
|
||||
/* bin local & ghost atoms */
|
||||
binatoms(atom);
|
||||
int resize = 1;
|
||||
|
||||
/* loop over each atom, storing neighbors */
|
||||
while(resize) {
|
||||
int new_maxneighs = neighbor->maxneighs;
|
||||
resize = 0;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
int* neighptr = &(neighbor->neighbors[i * neighbor->maxneighs]);
|
||||
int n = 0;
|
||||
MD_FLOAT xtmp = atom_x(i);
|
||||
MD_FLOAT ytmp = atom_y(i);
|
||||
MD_FLOAT ztmp = atom_z(i);
|
||||
int ibin = coord2bin(xtmp, ytmp, ztmp);
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_i = atom->type[i];
|
||||
#endif
|
||||
for(int k = 0; k < nstencil; k++) {
|
||||
int jbin = ibin + stencil[k];
|
||||
int* loc_bin = &bins[jbin * atoms_per_bin];
|
||||
|
||||
for(int m = 0; m < bincount[jbin]; m++) {
|
||||
int j = loc_bin[m];
|
||||
|
||||
if ( j == i ){
|
||||
continue;
|
||||
}
|
||||
|
||||
MD_FLOAT delx = xtmp - atom_x(j);
|
||||
MD_FLOAT dely = ytmp - atom_y(j);
|
||||
MD_FLOAT delz = ztmp - atom_z(j);
|
||||
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
|
||||
|
||||
#ifdef EXPLICIT_TYPES
|
||||
int type_j = atom->type[j];
|
||||
const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j];
|
||||
#else
|
||||
const MD_FLOAT cutoff = cutneighsq;
|
||||
#endif
|
||||
if( rsq <= cutoff ) {
|
||||
neighptr[n++] = j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
neighbor->numneigh[i] = n;
|
||||
|
||||
if(n >= neighbor->maxneighs) {
|
||||
resize = 1;
|
||||
|
||||
if(n >= new_maxneighs) {
|
||||
new_maxneighs = n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
printf("RESIZE %d\n", neighbor->maxneighs);
|
||||
neighbor->maxneighs = new_maxneighs * 1.2;
|
||||
free(neighbor->neighbors);
|
||||
neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* internal subroutines */
|
||||
MD_FLOAT bindist(int i, int j, int k)
|
||||
{
|
||||
MD_FLOAT delx, dely, delz;
|
||||
|
||||
if(i > 0) {
|
||||
delx = (i - 1) * binsizex;
|
||||
} else if(i == 0) {
|
||||
delx = 0.0;
|
||||
} else {
|
||||
delx = (i + 1) * binsizex;
|
||||
}
|
||||
|
||||
if(j > 0) {
|
||||
dely = (j - 1) * binsizey;
|
||||
} else if(j == 0) {
|
||||
dely = 0.0;
|
||||
} else {
|
||||
dely = (j + 1) * binsizey;
|
||||
}
|
||||
|
||||
if(k > 0) {
|
||||
delz = (k - 1) * binsizez;
|
||||
} else if(k == 0) {
|
||||
delz = 0.0;
|
||||
} else {
|
||||
delz = (k + 1) * binsizez;
|
||||
}
|
||||
|
||||
return (delx * delx + dely * dely + delz * delz);
|
||||
}
|
||||
|
||||
int coord2bin(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin)
|
||||
{
|
||||
int ix, iy, iz;
|
||||
|
||||
if(xin >= xprd) {
|
||||
ix = (int)((xin - xprd) * bininvx) + nbinx - mbinxlo;
|
||||
} else if(xin >= 0.0) {
|
||||
ix = (int)(xin * bininvx) - mbinxlo;
|
||||
} else {
|
||||
ix = (int)(xin * bininvx) - mbinxlo - 1;
|
||||
}
|
||||
|
||||
if(yin >= yprd) {
|
||||
iy = (int)((yin - yprd) * bininvy) + nbiny - mbinylo;
|
||||
} else if(yin >= 0.0) {
|
||||
iy = (int)(yin * bininvy) - mbinylo;
|
||||
} else {
|
||||
iy = (int)(yin * bininvy) - mbinylo - 1;
|
||||
}
|
||||
|
||||
if(zin >= zprd) {
|
||||
iz = (int)((zin - zprd) * bininvz) + nbinz - mbinzlo;
|
||||
} else if(zin >= 0.0) {
|
||||
iz = (int)(zin * bininvz) - mbinzlo;
|
||||
} else {
|
||||
iz = (int)(zin * bininvz) - mbinzlo - 1;
|
||||
}
|
||||
|
||||
return (iz * mbiny * mbinx + iy * mbinx + ix + 1);
|
||||
}
|
||||
|
||||
void binatoms(Atom *atom)
|
||||
{
|
||||
int nall = atom->Nlocal + atom->Nghost;
|
||||
int resize = 1;
|
||||
|
||||
while(resize > 0) {
|
||||
resize = 0;
|
||||
|
||||
for(int i = 0; i < mbins; i++) {
|
||||
bincount[i] = 0;
|
||||
}
|
||||
|
||||
for(int i = 0; i < nall; i++) {
|
||||
int ibin = coord2bin(atom_x(i), atom_y(i), atom_z(i));
|
||||
|
||||
if(bincount[ibin] < atoms_per_bin) {
|
||||
int ac = bincount[ibin]++;
|
||||
bins[ibin * atoms_per_bin + ac] = i;
|
||||
} else {
|
||||
resize = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if(resize) {
|
||||
free(bins);
|
||||
atoms_per_bin *= 2;
|
||||
bins = (int*) malloc(mbins * atoms_per_bin * sizeof(int));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void sortAtom(Atom* atom) {
|
||||
binatoms(atom);
|
||||
int Nmax = atom->Nmax;
|
||||
int* binpos = bincount;
|
||||
|
||||
for(int i=1; i<mbins; i++) {
|
||||
binpos[i] += binpos[i-1];
|
||||
}
|
||||
|
||||
#ifdef AOS
|
||||
double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT) * 3);
|
||||
#else
|
||||
double* new_x = (double*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
double* new_y = (double*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
double* new_z = (double*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
#endif
|
||||
double* new_vx = (double*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
double* new_vy = (double*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
double* new_vz = (double*) malloc(Nmax * sizeof(MD_FLOAT));
|
||||
double* old_x = atom->x; double* old_y = atom->y; double* old_z = atom->z;
|
||||
double* old_vx = atom->vx; double* old_vy = atom->vy; double* old_vz = atom->vz;
|
||||
|
||||
for(int mybin = 0; mybin<mbins; mybin++) {
|
||||
int start = mybin>0?binpos[mybin-1]:0;
|
||||
int count = binpos[mybin] - start;
|
||||
for(int k=0; k<count; k++) {
|
||||
int new_i = start + k;
|
||||
int old_i = bins[mybin * atoms_per_bin + k];
|
||||
#ifdef AOS
|
||||
new_x[new_i * 3 + 0] = old_x[old_i * 3 + 0];
|
||||
new_x[new_i * 3 + 1] = old_x[old_i * 3 + 1];
|
||||
new_x[new_i * 3 + 2] = old_x[old_i * 3 + 2];
|
||||
#else
|
||||
new_x[new_i] = old_x[old_i];
|
||||
new_y[new_i] = old_y[old_i];
|
||||
new_z[new_i] = old_z[old_i];
|
||||
#endif
|
||||
new_vx[new_i] = old_vx[old_i];
|
||||
new_vy[new_i] = old_vy[old_i];
|
||||
new_vz[new_i] = old_vz[old_i];
|
||||
}
|
||||
}
|
||||
|
||||
free(atom->x);
|
||||
atom->x = new_x;
|
||||
#ifndef AOS
|
||||
free(atom->y);
|
||||
free(atom->z);
|
||||
atom->y = new_y; atom->z = new_z;
|
||||
#endif
|
||||
free(atom->vx); free(atom->vy); free(atom->vz);
|
||||
atom->vx = new_vx; atom->vy = new_vy; atom->vz = new_vz;
|
||||
}
|
172
src/pbc.c
172
src/pbc.c
@ -1,172 +0,0 @@
|
||||
/*
|
||||
* =======================================================================================
|
||||
*
|
||||
* Author: Jan Eitzinger (je), jan.eitzinger@fau.de
|
||||
* Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
|
||||
*
|
||||
* This file is part of MD-Bench.
|
||||
*
|
||||
* MD-Bench is free software: you can redistribute it and/or modify it
|
||||
* under the terms of the GNU Lesser General Public License as published
|
||||
* by the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
|
||||
* WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
* PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License along
|
||||
* with MD-Bench. If not, see <https://www.gnu.org/licenses/>.
|
||||
* =======================================================================================
|
||||
*/
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <pbc.h>
|
||||
#include <atom.h>
|
||||
#include <allocate.h>
|
||||
|
||||
#define DELTA 20000
|
||||
|
||||
static int NmaxGhost;
|
||||
static int *PBCx, *PBCy, *PBCz;
|
||||
|
||||
static void growPbc(Atom*);
|
||||
|
||||
/* exported subroutines */
|
||||
void initPbc(Atom* atom)
|
||||
{
|
||||
NmaxGhost = 0;
|
||||
atom->border_map = NULL;
|
||||
PBCx = NULL; PBCy = NULL; PBCz = NULL;
|
||||
}
|
||||
|
||||
/* update coordinates of ghost atoms */
|
||||
/* uses mapping created in setupPbc */
|
||||
void updatePbc(Atom *atom, Parameter *param)
|
||||
{
|
||||
int *border_map = atom->border_map;
|
||||
int nlocal = atom->Nlocal;
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
for(int i = 0; i < atom->Nghost; i++) {
|
||||
atom_x(nlocal + i) = atom_x(border_map[i]) + PBCx[i] * xprd;
|
||||
atom_y(nlocal + i) = atom_y(border_map[i]) + PBCy[i] * yprd;
|
||||
atom_z(nlocal + i) = atom_z(border_map[i]) + PBCz[i] * zprd;
|
||||
}
|
||||
}
|
||||
|
||||
/* relocate atoms that have left domain according
|
||||
* to periodic boundary conditions */
|
||||
void updateAtomsPbc(Atom *atom, Parameter *param)
|
||||
{
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
|
||||
if(atom_x(i) < 0.0) {
|
||||
atom_x(i) += xprd;
|
||||
} else if(atom_x(i) >= xprd) {
|
||||
atom_x(i) -= xprd;
|
||||
}
|
||||
|
||||
if(atom_y(i) < 0.0) {
|
||||
atom_y(i) += yprd;
|
||||
} else if(atom_y(i) >= yprd) {
|
||||
atom_y(i) -= yprd;
|
||||
}
|
||||
|
||||
if(atom_z(i) < 0.0) {
|
||||
atom_z(i) += zprd;
|
||||
} else if(atom_z(i) >= zprd) {
|
||||
atom_z(i) -= zprd;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* setup periodic boundary conditions by
|
||||
* defining ghost atoms around domain
|
||||
* only creates mapping and coordinate corrections
|
||||
* that are then enforced in updatePbc */
|
||||
#define ADDGHOST(dx,dy,dz) \
|
||||
Nghost++; \
|
||||
border_map[Nghost] = i; \
|
||||
PBCx[Nghost] = dx; \
|
||||
PBCy[Nghost] = dy; \
|
||||
PBCz[Nghost] = dz; \
|
||||
atom->type[atom->Nlocal + Nghost] = atom->type[i]
|
||||
|
||||
void setupPbc(Atom *atom, Parameter *param)
|
||||
{
|
||||
int *border_map = atom->border_map;
|
||||
MD_FLOAT xprd = param->xprd;
|
||||
MD_FLOAT yprd = param->yprd;
|
||||
MD_FLOAT zprd = param->zprd;
|
||||
MD_FLOAT Cutneigh = param->cutneigh;
|
||||
int Nghost = -1;
|
||||
|
||||
for(int i = 0; i < atom->Nlocal; i++) {
|
||||
|
||||
if (atom->Nlocal + Nghost + 7 >= atom->Nmax) {
|
||||
growAtom(atom);
|
||||
}
|
||||
if (Nghost + 7 >= NmaxGhost) {
|
||||
growPbc(atom);
|
||||
border_map = atom->border_map;
|
||||
}
|
||||
|
||||
MD_FLOAT x = atom_x(i);
|
||||
MD_FLOAT y = atom_y(i);
|
||||
MD_FLOAT z = atom_z(i);
|
||||
|
||||
/* Setup ghost atoms */
|
||||
/* 6 planes */
|
||||
if (x < Cutneigh) { ADDGHOST(+1,0,0); }
|
||||
if (x >= (xprd-Cutneigh)) { ADDGHOST(-1,0,0); }
|
||||
if (y < Cutneigh) { ADDGHOST(0,+1,0); }
|
||||
if (y >= (yprd-Cutneigh)) { ADDGHOST(0,-1,0); }
|
||||
if (z < Cutneigh) { ADDGHOST(0,0,+1); }
|
||||
if (z >= (zprd-Cutneigh)) { ADDGHOST(0,0,-1); }
|
||||
/* 8 corners */
|
||||
if (x < Cutneigh && y < Cutneigh && z < Cutneigh) { ADDGHOST(+1,+1,+1); }
|
||||
if (x < Cutneigh && y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(+1,-1,+1); }
|
||||
if (x < Cutneigh && y >= Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(+1,+1,-1); }
|
||||
if (x < Cutneigh && y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(+1,-1,-1); }
|
||||
if (x >= (xprd-Cutneigh) && y < Cutneigh && z < Cutneigh) { ADDGHOST(-1,+1,+1); }
|
||||
if (x >= (xprd-Cutneigh) && y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(-1,-1,+1); }
|
||||
if (x >= (xprd-Cutneigh) && y < Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(-1,+1,-1); }
|
||||
if (x >= (xprd-Cutneigh) && y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(-1,-1,-1); }
|
||||
/* 12 edges */
|
||||
if (x < Cutneigh && z < Cutneigh) { ADDGHOST(+1,0,+1); }
|
||||
if (x < Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(+1,0,-1); }
|
||||
if (x >= (xprd-Cutneigh) && z < Cutneigh) { ADDGHOST(-1,0,+1); }
|
||||
if (x >= (xprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(-1,0,-1); }
|
||||
if (y < Cutneigh && z < Cutneigh) { ADDGHOST(0,+1,+1); }
|
||||
if (y < Cutneigh && z >= (zprd-Cutneigh)) { ADDGHOST(0,+1,-1); }
|
||||
if (y >= (yprd-Cutneigh) && z < Cutneigh) { ADDGHOST(0,-1,+1); }
|
||||
if (y >= (yprd-Cutneigh) && z >= (zprd-Cutneigh)) { ADDGHOST(0,-1,-1); }
|
||||
if (y < Cutneigh && x < Cutneigh) { ADDGHOST(+1,+1,0); }
|
||||
if (y < Cutneigh && x >= (xprd-Cutneigh)) { ADDGHOST(-1,+1,0); }
|
||||
if (y >= (yprd-Cutneigh) && x < Cutneigh) { ADDGHOST(+1,-1,0); }
|
||||
if (y >= (yprd-Cutneigh) && x >= (xprd-Cutneigh)) { ADDGHOST(-1,-1,0); }
|
||||
}
|
||||
// increase by one to make it the ghost atom count
|
||||
atom->Nghost = Nghost + 1;
|
||||
}
|
||||
|
||||
/* internal subroutines */
|
||||
void growPbc(Atom* atom)
|
||||
{
|
||||
int nold = NmaxGhost;
|
||||
NmaxGhost += DELTA;
|
||||
|
||||
atom->border_map = (int*) reallocate(atom->border_map, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
PBCx = (int*) reallocate(PBCx, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
PBCy = (int*) reallocate(PBCy, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
PBCz = (int*) reallocate(PBCz, ALIGNMENT, NmaxGhost * sizeof(int), nold * sizeof(int));
|
||||
}
|
Loading…
Reference in New Issue
Block a user