From ee3f6de050248160c5b00ac5092aa145897584e4 Mon Sep 17 00:00:00 2001 From: Andropov Arsenii Date: Sun, 18 Dec 2022 14:28:29 +0100 Subject: [PATCH] Building of super clusters complete, force computation kernel WIP --- Makefile | 4 + config.mk | 3 +- gromacs/atom.c | 40 ++++ gromacs/cuda/force_lj.cu | 95 +++++++- gromacs/cuda/force_lj_sup.cu | 290 +++++++++++++++++++++++ gromacs/includes/atom.h | 73 ++++++ gromacs/includes/neighbor.h | 1 + gromacs/includes/utils.h | 17 ++ gromacs/includes/vtk.h | 1 + gromacs/main.c | 34 ++- gromacs/neighbor.c | 439 +++++++++++++++++++++++++++++++++++ gromacs/utils.c | 277 ++++++++++++++++++++++ gromacs/vtk.c | 53 +++++ include_NVCC.mk | 3 +- 14 files changed, 1326 insertions(+), 4 deletions(-) create mode 100644 gromacs/cuda/force_lj_sup.cu create mode 100644 gromacs/includes/utils.h create mode 100644 gromacs/utils.c diff --git a/Makefile b/Makefile index 76bf162..f8fd47d 100644 --- a/Makefile +++ b/Makefile @@ -97,6 +97,10 @@ ifeq ($(strip $(USE_SIMD_KERNEL)),true) DEFINES += -DUSE_SIMD_KERNEL endif +ifeq ($(strip $(USE_SUPER_CLUSTERS)),true) + DEFINES += -DUSE_SUPER_CLUSTERS +endif + VPATH = $(SRC_DIR) $(ASM_DIR) $(CUDA_DIR) ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c)) OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s)) diff --git a/config.mk b/config.mk index 27b8a43..7e5f72a 100644 --- a/config.mk +++ b/config.mk @@ -1,5 +1,5 @@ # Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC) -TAG ?= ICC +TAG ?= NVCC # Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512) ISA ?= AVX512 # Optimization scheme (lammps/gromacs/clusters_per_bin) @@ -41,6 +41,7 @@ HALF_NEIGHBOR_LISTS_CHECK_CJ ?= false # Configurations for CUDA # Use CUDA host memory to optimize transfers USE_CUDA_HOST_MEMORY ?= false +USE_SUPER_CLUSTERS ?= true #Feature options OPTIONS = -DALIGNMENT=64 diff --git a/gromacs/atom.c b/gromacs/atom.c index 3f45116..047775e 100644 --- a/gromacs/atom.c +++ b/gromacs/atom.c @@ -37,6 +37,24 @@ void initAtom(Atom *atom) { atom->iclusters = NULL; atom->jclusters = NULL; atom->icluster_bin = NULL; + +#ifdef USE_SUPER_CLUSTERS + atom->scl_x = NULL; + atom->scl_v = NULL; + atom->scl_f = NULL; + + atom->Nsclusters = 0; + atom->Nsclusters_local = 0; + atom->Nsclusters_ghost = 0; + atom->Nsclusters_max = 0; + + atom->scl_type = NULL; + + atom->siclusters = NULL; + atom->icluster_idx = NULL; + + atom->sicluster_bin = NULL; +#endif //USE_SUPER_CLUSTERS } void createAtom(Atom *atom, Parameter *param) { @@ -421,3 +439,25 @@ void growClusters(Atom *atom) { atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT)); atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int)); } + +#ifdef USE_SUPER_CLUSTERS +void growSuperClusters(Atom *atom) { + int nold = atom->Nsclusters_max; + atom->Nsclusters_max += DELTA; + atom->siclusters = (SuperCluster*) reallocate(atom->siclusters, ALIGNMENT, atom->Nsclusters_max * sizeof(SuperCluster), nold * sizeof(SuperCluster)); + atom->icluster_idx = (int*) reallocate(atom->icluster_idx, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int), nold * SCLUSTER_SIZE * sizeof(int)); + atom->sicluster_bin = (int*) reallocate(atom->sicluster_bin, ALIGNMENT, atom->Nsclusters_max * sizeof(int), nold * sizeof(int)); + //atom->scl_type = (int*) reallocate(atom->scl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * SCLUSTER_SIZE * sizeof(int), nold * CLUSTER_M * SCLUSTER_SIZE * sizeof(int)); + + atom->scl_x = (MD_FLOAT*) reallocate(atom->scl_x, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT)); + atom->scl_f = (MD_FLOAT*) reallocate(atom->scl_f, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT)); + atom->scl_v = (MD_FLOAT*) reallocate(atom->scl_v, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT)); + + + /* + for (int sci = 0; sci < atom->Nsclusters_max; sci++) { + atom->siclusters[sci].iclusters = (int*) reallocate(atom->siclusters[sci].iclusters, ALIGNMENT, SCLUSTER_SIZE * sizeof(int), SCLUSTER_SIZE * sizeof(int)); + } + */ +} +#endif //USE_SUPER_CLUSTERS diff --git a/gromacs/cuda/force_lj.cu b/gromacs/cuda/force_lj.cu index 1dcbde5..2b7d449 100644 --- a/gromacs/cuda/force_lj.cu +++ b/gromacs/cuda/force_lj.cu @@ -39,8 +39,29 @@ extern "C" { MD_FLOAT *cuda_bbminz, *cuda_bbmaxz; int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz; int isReneighboured; + + int *cuda_iclusters; + int *cuda_nclusters; + + int cuda_max_scl; + MD_FLOAT *cuda_scl_x; + MD_FLOAT *cuda_scl_v; + MD_FLOAT *cuda_scl_f; + + extern void alignDataToSuperclusters(Atom *atom); + extern void alignDataFromSuperclusters(Atom *atom); + extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats); } +extern __global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f, + int *cuda_nclusters, + int *cuda_natoms, + int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt); + +extern __global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f, + int *cuda_nclusters, int *cuda_natoms, + int Nsclusters_local, MD_FLOAT dtforce); + extern "C" void initDevice(Atom *atom, Neighbor *neighbor) { cuda_assert("cudaDeviceSetup", cudaDeviceReset()); @@ -59,10 +80,23 @@ void initDevice(Atom *atom, Neighbor *neighbor) { natoms = (int *) malloc(atom->Nclusters_max * sizeof(int)); ngatoms = (int *) malloc(atom->Nclusters_max * sizeof(int)); isReneighboured = 1; + +#ifdef USE_SUPER_CLUSTERS + cuda_max_scl = atom->Nsclusters_max; + cuda_iclusters = (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int)); + cuda_nclusters = (int *) allocateGPU(atom->Nsclusters_max * sizeof(int)); + + cuda_scl_x = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT)); + cuda_scl_v = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT)); + cuda_scl_f = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT)); + +#endif //USE_SUPER_CLUSTERS } extern "C" void copyDataToCUDADevice(Atom *atom) { + DEBUG_MESSAGE("copyDataToCUDADevice start\r\n"); + memcpyToGPU(cuda_cl_x, atom->cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); memcpyToGPU(cuda_cl_v, atom->cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); memcpyToGPU(cuda_cl_f, atom->cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); @@ -85,13 +119,49 @@ void copyDataToCUDADevice(Atom *atom) { memcpyToGPU(cuda_PBCx, atom->PBCx, atom->Nclusters_ghost * sizeof(int)); memcpyToGPU(cuda_PBCy, atom->PBCy, atom->Nclusters_ghost * sizeof(int)); memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int)); + +#ifdef USE_SUPER_CLUSTERS + alignDataToSuperclusters(atom); + + if (cuda_max_scl < atom->Nsclusters_max) { + cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x)); + cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v)); + cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f)); + cuda_max_scl = atom->Nsclusters_max; + + cuda_iclusters = (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int)); + cuda_nclusters = (int *) allocateGPU(atom->Nsclusters_max * sizeof(int)); + + cuda_scl_x = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT)); + cuda_scl_v = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT)); + cuda_scl_f = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT)); + } + memcpyToGPU(cuda_scl_x, atom->scl_x, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT)); + memcpyToGPU(cuda_scl_v, atom->scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT)); + memcpyToGPU(cuda_scl_f, atom->scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT)); +#endif //USE_SUPER_CLUSTERS + + DEBUG_MESSAGE("copyDataToCUDADevice stop\r\n"); + } extern "C" void copyDataFromCUDADevice(Atom *atom) { + DEBUG_MESSAGE("copyDataFromCUDADevice start\r\n"); + memcpyFromGPU(atom->cl_x, cuda_cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); memcpyFromGPU(atom->cl_v, cuda_cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); memcpyFromGPU(atom->cl_f, cuda_cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + +#ifdef USE_SUPER_CLUSTERS + alignDataFromSuperclusters(atom); + + memcpyFromGPU(atom->scl_x, cuda_scl_x, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT)); + memcpyFromGPU(atom->scl_v, cuda_scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT)); + memcpyFromGPU(atom->scl_f, cuda_scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT)); +#endif //USE_SUPER_CLUSTERS + + DEBUG_MESSAGE("copyDataFromCUDADevice stop\r\n"); } extern "C" @@ -109,6 +179,12 @@ void cudaDeviceFree() { cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCz)); free(natoms); free(ngatoms); + +#ifdef USE_SUPER_CLUSTERS + cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x)); + cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v)); + cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f)); +#endif //USE_SUPER_CLUSTERS } __global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f, @@ -251,9 +327,17 @@ extern "C" void cudaInitialIntegrate(Parameter *param, Atom *atom) { const int threads_num = 16; dim3 block_size = dim3(threads_num, 1, 1); + + #ifdef USE_SUPER_CLUSTERS + dim3 grid_size = dim3(atom->Nsclusters_local/(threads_num)+1, 1, 1); + cudaInitialIntegrateSup_warp<<>>(cuda_scl_x, cuda_scl_v, cuda_scl_f, + cuda_nclusters, + cuda_natoms, atom->Nsclusters_local, param->dtforce, param->dt); + #else dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1); cudaInitialIntegrate_warp<<>>(cuda_cl_x, cuda_cl_v, cuda_cl_f, cuda_natoms, atom->Nclusters_local, param->dtforce, param->dt); + #endif //USE_SUPER_CLUSTERS cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError()); cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize()); } @@ -310,8 +394,17 @@ extern "C" void cudaFinalIntegrate(Parameter *param, Atom *atom) { const int threads_num = 16; dim3 block_size = dim3(threads_num, 1, 1); + + #ifdef USE_SUPER_CLUSTERS + dim3 grid_size = dim3(atom->Nsclusters_local/(threads_num)+1, 1, 1); + cudaFinalIntegrateSup_warp<<>>(cuda_scl_v, cuda_scl_f, + cuda_nclusters, cuda_natoms, + atom->Nsclusters_local, param->dt); + #else dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1); - cudaFinalIntegrate_warp<<>>(cuda_cl_v, cuda_cl_f, cuda_natoms, atom->Nclusters_local, param->dt); + cudaFinalIntegrate_warp<<>>(cuda_cl_v, cuda_cl_f, cuda_natoms, + atom->Nclusters_local, param->dt); + #endif //USE_SUPER_CLUSTERS cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError()); cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize()); } diff --git a/gromacs/cuda/force_lj_sup.cu b/gromacs/cuda/force_lj_sup.cu new file mode 100644 index 0000000..23cf486 --- /dev/null +++ b/gromacs/cuda/force_lj_sup.cu @@ -0,0 +1,290 @@ + +extern "C" { + +#include +//--- +#include +#include +//--- +#include +//--- +#include +#include +#include +#include +#include +#include +#include + +} + +extern "C" { + extern MD_FLOAT *cuda_cl_x; + extern MD_FLOAT *cuda_cl_v; + extern MD_FLOAT *cuda_cl_f; + extern int *cuda_neighbors; + extern int *cuda_numneigh; + extern int *cuda_natoms; + extern int *natoms; + extern int *ngatoms; + extern int *cuda_border_map; + extern int *cuda_jclusters_natoms; + extern MD_FLOAT *cuda_bbminx, *cuda_bbmaxx; + extern MD_FLOAT *cuda_bbminy, *cuda_bbmaxy; + extern MD_FLOAT *cuda_bbminz, *cuda_bbmaxz; + extern int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz; + extern int isReneighboured; + + extern int *cuda_iclusters; + extern int *cuda_nclusters; + + extern MD_FLOAT *cuda_scl_x; + extern MD_FLOAT *cuda_scl_v; + extern MD_FLOAT *cuda_scl_f; +} + +#ifdef USE_SUPER_CLUSTERS +extern "C" +void alignDataToSuperclusters(Atom *atom) { + for (int sci = 0; sci < atom->Nsclusters_local; sci++) { + const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M; + + for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) { + + MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])]; + MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])]; + MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])]; + + /* + MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])]; + MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])]; + MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])]; + */ + + memcpy(&atom->scl_x[scci], &ci_x[0], CLUSTER_M * sizeof(MD_FLOAT)); + memcpy(&atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT)); + memcpy(&atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT)); + + memcpy(&atom->scl_v[scci], &ci_v[0], CLUSTER_M * sizeof(MD_FLOAT)); + memcpy(&atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT)); + memcpy(&atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT)); + + memcpy(&atom->scl_f[scci], &ci_f[0], CLUSTER_M * sizeof(MD_FLOAT)); + memcpy(&atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT)); + memcpy(&atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT)); + + } + } +} + +extern "C" +void alignDataFromSuperclusters(Atom *atom) { + for (int sci = 0; sci < atom->Nsclusters_local; sci++) { + const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M; + + for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) { + + + MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])]; + MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])]; + MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])]; + + /* + MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])]; + MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])]; + MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])]; + */ + + memcpy(&ci_x[0], &atom->scl_x[scci], CLUSTER_M * sizeof(MD_FLOAT)); + memcpy(&ci_x[0 + CLUSTER_M], &atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT)); + memcpy(&ci_x[0 + 2 * CLUSTER_M], &atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT)); + + memcpy(&ci_v[0], &atom->scl_v[scci], CLUSTER_M * sizeof(MD_FLOAT)); + memcpy(&ci_v[0 + CLUSTER_M], &atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT)); + memcpy(&ci_v[0 + 2 * CLUSTER_M], &atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT)); + + memcpy(&ci_f[0], &atom->scl_f[scci], CLUSTER_M * sizeof(MD_FLOAT)); + memcpy(&ci_f[0 + CLUSTER_M], &atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT)); + memcpy(&ci_f[0 + 2 * CLUSTER_M], &atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT)); + + } + } +} + +__global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f, + int *cuda_nclusters, + int *cuda_natoms, + int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) { + + unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x; + //unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y; + if (sci_pos >= Nsclusters_local) return; + + //unsigned int ci_pos = cii_pos / CLUSTER_M; + //unsigned int scii_pos = cii_pos % CLUSTER_M; + + //if (ci_pos >= cuda_nclusters[sci_pos]) return; + //if (scii_pos >= cuda_natoms[ci_pos]) return; + + int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos); + MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base]; + MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base]; + MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base]; + + for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) { + ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos]; + ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos]; + ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos]; + ci_x[SCL_X_OFFSET + scii_pos] += dt * ci_v[SCL_X_OFFSET + scii_pos]; + ci_x[SCL_Y_OFFSET + scii_pos] += dt * ci_v[SCL_Y_OFFSET + scii_pos]; + ci_x[SCL_Z_OFFSET + scii_pos] += dt * ci_v[SCL_Z_OFFSET + scii_pos]; + } +} + +__global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f, + int *cuda_nclusters, int *cuda_natoms, + int Nsclusters_local, MD_FLOAT dtforce) { + + unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x; + //unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y; + if (sci_pos >= Nsclusters_local) return; + + //unsigned int ci_pos = cii_pos / CLUSTER_M; + //unsigned int scii_pos = cii_pos % CLUSTER_M; + + //if (ci_pos >= cuda_nclusters[sci_pos]) return; + //if (scii_pos >= cuda_natoms[ci_pos]) return; + + int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos); + MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base]; + MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base]; + + for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) { + ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos]; + ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos]; + ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos]; + } + +} + +__global__ void computeForceLJSup_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f, + int *cuda_nclusters, int *cuda_iclusters, + int Nsclusters_local, + int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs, + MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) { + + unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int scii_pos = blockDim.y * blockIdx.y + threadIdx.y; + unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z; + if ((sci_pos >= Nsclusters_local) || (scii_pos >= SCLUSTER_M) || (cjj_pos >= CLUSTER_N)) return; + + unsigned int ci_pos = scii_pos / CLUSTER_M; + unsigned int cii_pos = scii_pos % CLUSTER_M; + + if (ci_pos >= cuda_nclusters[sci_pos]) return; + + int ci_cj0 = CJ0_FROM_CI(ci_pos); + int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos); + MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base]; + MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base]; + + + //int numneighs = cuda_numneigh[ci_pos]; + int numneighs = cuda_numneigh[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos]]; + + for(int k = 0; k < numneighs; k++) { + int cj = (&cuda_neighs[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos] * maxneighs])[k]; + // TODO Make cj accessible from super cluster data alignment (not reachable right now) + int cj_vec_base = SCJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base]; + MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base]; + + MD_FLOAT xtmp = ci_x[SCL_CL_X_OFFSET(ci_pos) + cii_pos]; + MD_FLOAT ytmp = ci_x[SCL_CL_Y_OFFSET(ci_pos) + cii_pos]; + MD_FLOAT ztmp = ci_x[SCL_CL_Z_OFFSET(ci_pos) + cii_pos]; + MD_FLOAT fix = 0; + MD_FLOAT fiy = 0; + MD_FLOAT fiz = 0; + + int cond; +#if CLUSTER_M == CLUSTER_N + cond = half_neigh ? (ci_cj0 != cj || cii_pos < cjj_pos) : + (ci_cj0 != cj || cii_pos != cjj_pos); +#elif CLUSTER_M < CLUSTER_N + cond = half_neigh ? (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) < cjj_pos) : + (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos); +#endif + if(cond) { + MD_FLOAT delx = xtmp - cj_x[SCL_CL_X_OFFSET(ci_pos) + cjj_pos]; + MD_FLOAT dely = ytmp - cj_x[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos]; + MD_FLOAT delz = ztmp - cj_x[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos]; + MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; + if(rsq < cutforcesq) { + MD_FLOAT sr2 = 1.0 / rsq; + MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6; + MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon; + + if(half_neigh) { + atomicAdd(&cj_f[SCL_CL_X_OFFSET(ci_pos) + cjj_pos], -delx * force); + atomicAdd(&cj_f[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos], -dely * force); + atomicAdd(&cj_f[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos], -delz * force); + } + + fix += delx * force; + fiy += dely * force; + fiz += delz * force; + + atomicAdd(&ci_f[SCL_CL_X_OFFSET(ci_pos) + cii_pos], fix); + atomicAdd(&ci_f[SCL_CL_Y_OFFSET(ci_pos) + cii_pos], fiy); + atomicAdd(&ci_f[SCL_CL_Z_OFFSET(ci_pos) + cii_pos], fiz); + } + } + } + +} + +extern "C" +double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) { + DEBUG_MESSAGE("computeForceLJSup_cuda start\r\n"); + + MD_FLOAT cutforcesq = param->cutforce * param->cutforce; + MD_FLOAT sigma6 = param->sigma6; + MD_FLOAT epsilon = param->epsilon; + + memsetGPU(cuda_cl_f, 0, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); + if (isReneighboured) { + + for(int ci = 0; ci < atom->Nclusters_local; ci++) { + memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int)); + memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int)); + } + + for(int sci = 0; sci < atom->Nsclusters_local; sci++) { + memcpyToGPU(&cuda_nclusters[sci], &atom->siclusters[sci].nclusters, sizeof(int)); + //memcpyToGPU(&cuda_iclusters[sci * SCLUSTER_SIZE], &atom->siclusters[sci].iclusters, sizeof(int) * atom->siclusters[sci].nclusters); + } + + memcpyToGPU(cuda_iclusters, atom->icluster_idx, atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int)); + + isReneighboured = 0; + } + + const int threads_num = 1; + dim3 block_size = dim3(threads_num, SCLUSTER_M, CLUSTER_N); + dim3 grid_size = dim3(atom->Nsclusters_local/threads_num+1, 1, 1); + double S = getTimeStamp(); + LIKWID_MARKER_START("force"); + computeForceLJSup_cuda_warp<<>>(cuda_scl_x, cuda_scl_f, + cuda_nclusters, cuda_iclusters, + atom->Nsclusters_local, + cuda_numneigh, cuda_neighbors, + neighbor->half_neigh, neighbor->maxneighs, cutforcesq, + sigma6, epsilon); + cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError()); + cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize()); + LIKWID_MARKER_STOP("force"); + double E = getTimeStamp(); + DEBUG_MESSAGE("computeForceLJSup_cuda stop\r\n"); + return E-S; +} +#endif //USE_SUPER_CLUSTERS diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h index 8c4cb98..13f9d73 100644 --- a/gromacs/includes/atom.h +++ b/gromacs/includes/atom.h @@ -22,7 +22,25 @@ # define KERNEL_NAME "CUDA" # define CLUSTER_M 8 # define CLUSTER_N VECTOR_WIDTH + +#ifdef USE_SUPER_CLUSTERS +# define XX 0 +# define YY 1 +# define ZZ 2 +# define SCLUSTER_SIZE_X 2 +# define SCLUSTER_SIZE_Y 2 +# define SCLUSTER_SIZE_Z 2 +# define SCLUSTER_SIZE (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z) +# define DIM_COORD(dim,coord) ((dim == XX) ? atom_x(coord) : ((dim == YY) ? atom_y(coord) : atom_z(coord))) +# define MIN(a,b) ({int _a = (a), _b = (b); _a < _b ? _a : _b; }) +# define SCLUSTER_M CLUSTER_M * SCLUSTER_SIZE + +# define computeForceLJ computeForceLJSup_cuda +#else # define computeForceLJ computeForceLJ_cuda + +#endif //USE_SUPER_CLUSTERS + # define initialIntegrate cudaInitialIntegrate # define finalIntegrate cudaFinalIntegrate # define updatePbc cudaUpdatePbc @@ -55,16 +73,28 @@ # define CJ1_FROM_CI(a) (a) # define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b)) # define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b)) +#ifdef USE_SUPER_CLUSTERS +# define SCI_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b)) +# define SCJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b)) +#endif //USE_SUPER_CLUSTERS #elif CLUSTER_M == CLUSTER_N * 2 // M > N # define CJ0_FROM_CI(a) ((a) << 1) # define CJ1_FROM_CI(a) (((a) << 1) | 0x1) # define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b)) # define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1)) +#ifdef USE_SUPER_CLUSTERS +# define SCI_BASE_INDEX(a,b) ((a) * CLUSTER_M * SCLUSTER_SIZE * (b)) +# define SCJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (SCLUSTER_SIZE * CLUSTER_M >> 1)) +#endif //USE_SUPER_CLUSTERS #elif CLUSTER_M == CLUSTER_N / 2 // M < N # define CJ0_FROM_CI(a) ((a) >> 1) # define CJ1_FROM_CI(a) ((a) >> 1) # define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1)) # define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b)) +#ifdef USE_SUPER_CLUSTERS +# define SCI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (CLUSTER_N * SCLUSTER_SIZE >> 1)) +# define SCJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b)) +#endif //USE_SUPER_CLUSTERS #else # error "Invalid cluster configuration!" #endif @@ -78,14 +108,37 @@ #define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1)) #define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3)) +#ifdef USE_SUPER_CLUSTERS +#define SCI_SCALAR_BASE_INDEX(a) (SCI_BASE_INDEX(a, 1)) +#define SCI_VECTOR_BASE_INDEX(a) (SCI_BASE_INDEX(a, 3)) +#define SCJ_SCALAR_BASE_INDEX(a) (SCJ_BASE_INDEX(a, 1)) +#define SCJ_VECTOR_BASE_INDEX(a) (SCJ_BASE_INDEX(a, 3)) +#endif //USE_SUPER_CLUSTERS + #if CLUSTER_M >= CLUSTER_N # define CL_X_OFFSET (0 * CLUSTER_M) # define CL_Y_OFFSET (1 * CLUSTER_M) # define CL_Z_OFFSET (2 * CLUSTER_M) + +#ifdef USE_SUPER_CLUSTERS +# define SCL_CL_X_OFFSET(ci) (ci * CLUSTER_M + 0 * SCLUSTER_M) +# define SCL_CL_Y_OFFSET(ci) (ci * CLUSTER_M + 1 * SCLUSTER_M) +# define SCL_CL_Z_OFFSET(ci) (ci * CLUSTER_M + 2 * SCLUSTER_M) + +# define SCL_X_OFFSET (0 * SCLUSTER_M) +# define SCL_Y_OFFSET (1 * SCLUSTER_M) +# define SCL_Z_OFFSET (2 * SCLUSTER_M) +#endif //USE_SUPER_CLUSTERS #else # define CL_X_OFFSET (0 * CLUSTER_N) # define CL_Y_OFFSET (1 * CLUSTER_N) # define CL_Z_OFFSET (2 * CLUSTER_N) + +#ifdef USE_SUPER_CLUSTERS +# define SCL_X_OFFSET (0 * SCLUSTER_SIZE * CLUSTER_N) +# define SCL_Y_OFFSET (1 * SCLUSTER_SIZE * CLUSTER_N) +# define SCL_Z_OFFSET (2 * SCLUSTER_SIZE * CLUSTER_N) +#endif //USE_SUPER_CLUSTERS #endif typedef struct { @@ -95,6 +148,14 @@ typedef struct { MD_FLOAT bbminz, bbmaxz; } Cluster; +typedef struct { + //int *iclusters; + int nclusters; + MD_FLOAT bbminx, bbmaxx; + MD_FLOAT bbminy, bbmaxy; + MD_FLOAT bbminz, bbmaxz; +} SuperCluster; + typedef struct { int Natoms, Nlocal, Nghost, Nmax; int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max; @@ -116,6 +177,17 @@ typedef struct { Cluster *iclusters, *jclusters; int *icluster_bin; int dummy_cj; + +#ifdef USE_SUPER_CLUSTERS + int Nsclusters, Nsclusters_local, Nsclusters_ghost, Nsclusters_max; + MD_FLOAT *scl_x; + MD_FLOAT *scl_v; + MD_FLOAT *scl_f; + int *scl_type; + int *icluster_idx; + SuperCluster *siclusters; + int *sicluster_bin; +#endif //USE_SUPER_CLUSTERS } Atom; extern void initAtom(Atom*); @@ -126,6 +198,7 @@ extern int readAtom_gro(Atom*, Parameter*); extern int readAtom_dmp(Atom*, Parameter*); extern void growAtom(Atom*); extern void growClusters(Atom*); +extern void growSuperClusters(Atom*); #ifdef AOS # define POS_DATA_LAYOUT "AoS" diff --git a/gromacs/includes/neighbor.h b/gromacs/includes/neighbor.h index ddaaeed..66f4ee3 100644 --- a/gromacs/includes/neighbor.h +++ b/gromacs/includes/neighbor.h @@ -25,6 +25,7 @@ extern void buildNeighbor(Atom*, Neighbor*); extern void pruneNeighbor(Parameter*, Atom*, Neighbor*); extern void sortAtom(Atom*); extern void buildClusters(Atom*); +extern void buildClustersGPU(Atom*); extern void defineJClusters(Atom*); extern void binClusters(Atom*); extern void updateSingleAtoms(Atom*); diff --git a/gromacs/includes/utils.h b/gromacs/includes/utils.h new file mode 100644 index 0000000..d2c3130 --- /dev/null +++ b/gromacs/includes/utils.h @@ -0,0 +1,17 @@ +/* + * Temporal functions for debugging, remove before proceeding with pull request + */ + +#ifndef MD_BENCH_UTILS_H +#define MD_BENCH_UTILS_H + +#include + +#ifdef USE_SUPER_CLUSTERS +void verifyClusters(Atom *atom); +void verifyLayout(Atom *atom); +void checkAlignment(Atom *atom); +void showSuperclusters(Atom *atom); +#endif //USE_SUPER_CLUSTERS + +#endif //MD_BENCH_UTILS_H diff --git a/gromacs/includes/vtk.h b/gromacs/includes/vtk.h index 5311342..ca81618 100644 --- a/gromacs/includes/vtk.h +++ b/gromacs/includes/vtk.h @@ -9,6 +9,7 @@ #ifndef __VTK_H_ #define __VTK_H_ extern void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep); +extern int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep); extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep); extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep); extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep); diff --git a/gromacs/main.c b/gromacs/main.c index 93be03b..5cd66b1 100644 --- a/gromacs/main.c +++ b/gromacs/main.c @@ -38,7 +38,16 @@ extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighb extern void copyDataToCUDADevice(Atom *atom); extern void copyDataFromCUDADevice(Atom *atom); extern void cudaDeviceFree(); -#endif + +#ifdef USE_SUPER_CLUSTERS +#include +extern void buildNeighborGPU(Atom *atom, Neighbor *neighbor); +extern void pruneNeighborGPU(Parameter *param, Atom *atom, Neighbor *neighbor); +extern void alignDataToSuperclusters(Atom *atom); +extern void alignDataFromSuperclusters(Atom *atom); +extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats); +#endif //USE_SUPER_CLUSTERS +#endif //CUDA_TARGET double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) { if(param->force_field == FF_EAM) { initEam(eam, param); } @@ -62,11 +71,19 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats * setupNeighbor(param, atom); setupThermo(param, atom->Natoms); if(param->input_file == NULL) { adjustThermo(param, atom); } + #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS) + buildClustersGPU(atom); + #else buildClusters(atom); + #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS) defineJClusters(atom); setupPbc(atom, param); binClusters(atom); + #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS) + buildNeighborGPU(atom, neighbor); + #else buildNeighbor(atom, neighbor); + #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS) initDevice(atom, neighbor); E = getTimeStamp(); return E-S; @@ -78,11 +95,19 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) { LIKWID_MARKER_START("reneighbour"); updateSingleAtoms(atom); updateAtomsPbc(atom, param); + #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS) + buildClustersGPU(atom); + #else buildClusters(atom); + #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS) defineJClusters(atom); setupPbc(atom, param); binClusters(atom); + #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS) + buildNeighborGPU(atom, neighbor); + #else buildNeighbor(atom, neighbor); + #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS) LIKWID_MARKER_STOP("reneighbour"); E = getTimeStamp(); return E-S; @@ -237,11 +262,18 @@ int main(int argc, char** argv) { } for(int n = 0; n < param.ntimes; n++) { + + //printf("Step:\t%d\r\n", n); + initialIntegrate(¶m, &atom); if((n + 1) % param.reneigh_every) { if(!((n + 1) % param.prune_every)) { + #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS) + pruneNeighborGPU(¶m, &atom, &neighbor); + #else pruneNeighbor(¶m, &atom, &neighbor); + #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS) } updatePbc(&atom, ¶m, 0); diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c index fea0d2f..775c5dd 100644 --- a/gromacs/neighbor.c +++ b/gromacs/neighbor.c @@ -77,8 +77,13 @@ void setupNeighbor(Parameter *param, Atom *atom) { MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo)); MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N); + #ifdef USE_SUPER_CLUSTERS + MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density) * (MD_FLOAT)SCLUSTER_SIZE_X; + MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density) * (MD_FLOAT)SCLUSTER_SIZE_Y; + #else MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density); MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density); + #endif nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex)); nbiny = MAX(1, (int)ceil((yhi - ylo) / targetsizey)); binsizex = (xhi - xlo) / nbinx; @@ -364,6 +369,197 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) { DEBUG_MESSAGE("buildNeighbor end\n"); } +#ifdef USE_SUPER_CLUSTERS +// TODO For future parallelization on GPU +void buildNeighborGPU(Atom *atom, Neighbor *neighbor) { + DEBUG_MESSAGE("buildNeighborGPU start\n"); + + /* extend atom arrays if necessary */ + if(atom->Nclusters_local > nmax) { + nmax = atom->Nclusters_local; + //nmax = atom->Nsclusters_local; + if(neighbor->numneigh) free(neighbor->numneigh); + if(neighbor->neighbors) free(neighbor->neighbors); + neighbor->numneigh = (int*) malloc(nmax * sizeof(int)); + neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int*)); + } + + MD_FLOAT bbx = 0.5 * (binsizex + binsizex); + MD_FLOAT bby = 0.5 * (binsizey + binsizey); + MD_FLOAT rbb_sq = MAX(0.0, cutneigh - 0.5 * sqrt(bbx * bbx + bby * bby)); + rbb_sq = rbb_sq * rbb_sq; + int resize = 1; + + /* loop over each atom, storing neighbors */ + while(resize) { + int new_maxneighs = neighbor->maxneighs; + resize = 0; + + //for (int sci = 0; sci < atom->Nsclusters_local; sci++) { + //for (int scii = 0; scii < atom->siclusters[sci].nclusters; scii++) { + for(int ci = 0; ci < atom->Nclusters_local; ci++) { + //const int ci = atom->siclusters[sci].iclusters[scii]; + //const int ci = atom->icluster_idx[SCLUSTER_SIZE * sci + scii]; + int ci_cj1 = CJ1_FROM_CI(ci); + int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); + //int *neighptr = &(neighbor->neighbors[sci * neighbor->maxneighs]); + int n = 0; + int ibin = atom->icluster_bin[ci]; + MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx; + MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx; + MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy; + MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy; + MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz; + MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz; + + for(int k = 0; k < nstencil; k++) { + int jbin = ibin + stencil[k]; + int *loc_bin = &bin_clusters[jbin * clusters_per_bin]; + int cj, m = -1; + MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax; + const int c = bin_nclusters[jbin]; + + if(c > 0) { + MD_FLOAT dl, dh, dm, dm0, d_bb_sq; + + do { + m++; + cj = loc_bin[m]; + if(neighbor->half_neigh && ci_cj1 > cj) { + continue; + } + jbb_zmin = atom->jclusters[cj].bbminz; + jbb_zmax = atom->jclusters[cj].bbmaxz; + dl = ibb_zmin - jbb_zmax; + dh = jbb_zmin - ibb_zmax; + dm = MAX(dl, dh); + dm0 = MAX(dm, 0.0); + d_bb_sq = dm0 * dm0; + } while(m + 1 < c && d_bb_sq > cutneighsq); + + jbb_xmin = atom->jclusters[cj].bbminx; + jbb_xmax = atom->jclusters[cj].bbmaxx; + jbb_ymin = atom->jclusters[cj].bbminy; + jbb_ymax = atom->jclusters[cj].bbmaxy; + + while(m < c) { + if(!neighbor->half_neigh || ci_cj1 <= cj) { + dl = ibb_zmin - jbb_zmax; + dh = jbb_zmin - ibb_zmax; + dm = MAX(dl, dh); + dm0 = MAX(dm, 0.0); + d_bb_sq = dm0 * dm0; + + /*if(d_bb_sq > cutneighsq) { + break; + }*/ + + dl = ibb_ymin - jbb_ymax; + dh = jbb_ymin - ibb_ymax; + dm = MAX(dl, dh); + dm0 = MAX(dm, 0.0); + d_bb_sq += dm0 * dm0; + + dl = ibb_xmin - jbb_xmax; + dh = jbb_xmin - ibb_xmax; + dm = MAX(dl, dh); + dm0 = MAX(dm, 0.0); + d_bb_sq += dm0 * dm0; + + if(d_bb_sq < cutneighsq) { + if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) { + neighptr[n++] = cj; + } + } + } + + m++; + if(m < c) { + cj = loc_bin[m]; + jbb_xmin = atom->jclusters[cj].bbminx; + jbb_xmax = atom->jclusters[cj].bbmaxx; + jbb_ymin = atom->jclusters[cj].bbminy; + jbb_ymax = atom->jclusters[cj].bbmaxy; + jbb_zmin = atom->jclusters[cj].bbminz; + jbb_zmax = atom->jclusters[cj].bbmaxz; + } + } + } + } + + // Fill neighbor list with dummy values to fit vector width + if(CLUSTER_N < VECTOR_WIDTH) { + while(n % (VECTOR_WIDTH / CLUSTER_N)) { + neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster + } + } + + //neighbor->numneigh[sci] = n; + neighbor->numneigh[ci] = n; + if(n >= neighbor->maxneighs) { + resize = 1; + + if(n >= new_maxneighs) { + new_maxneighs = n; + } + } + } + //} + + if(resize) { + fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs); + neighbor->maxneighs = new_maxneighs * 1.2; + free(neighbor->neighbors); + neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int)); + } + } + + /* + DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq); + for(int ci = 0; ci < 6; ci++) { + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]); + + DEBUG_MESSAGE("Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n", + ci, + atom->iclusters[ci].bbminx, + atom->iclusters[ci].bbmaxx, + atom->iclusters[ci].bbminy, + atom->iclusters[ci].bbmaxy, + atom->iclusters[ci].bbminz, + atom->iclusters[ci].bbmaxz); + + for(int cii = 0; cii < CLUSTER_M; cii++) { + DEBUG_MESSAGE("%f, %f, %f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); + } + + DEBUG_MESSAGE("Neighbors:\n"); + for(int k = 0; k < neighbor->numneigh[ci]; k++) { + int cj = neighptr[k]; + int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj); + MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base]; + + DEBUG_MESSAGE(" Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n", + cj, + atom->jclusters[cj].bbminx, + atom->jclusters[cj].bbmaxx, + atom->jclusters[cj].bbminy, + atom->jclusters[cj].bbmaxy, + atom->jclusters[cj].bbminz, + atom->jclusters[cj].bbmaxz); + + for(int cjj = 0; cjj < CLUSTER_N; cjj++) { + DEBUG_MESSAGE(" %f, %f, %f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]); + } + } + } + */ + + DEBUG_MESSAGE("buildNeighborGPU end\n"); +} +#endif //USE_SUPER_CLUSTERS + void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { DEBUG_MESSAGE("pruneNeighbor start\n"); //MD_FLOAT cutsq = param->cutforce * param->cutforce; @@ -404,6 +600,53 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) { DEBUG_MESSAGE("pruneNeighbor end\n"); } +#ifdef USE_SUPER_CLUSTERS +void pruneNeighborGPU(Parameter *param, Atom *atom, Neighbor *neighbor) { + DEBUG_MESSAGE("pruneNeighbor start\n"); + //MD_FLOAT cutsq = param->cutforce * param->cutforce; + MD_FLOAT cutsq = cutneighsq; + + for (int sci = 0; sci < atom->Nsclusters_local; sci++) { + for (int scii = 0; scii < atom->siclusters[sci].nclusters; scii++) { + //const int ci = atom->siclusters[sci].iclusters[scii]; + const int ci = atom->icluster_idx[SCLUSTER_SIZE * sci + ci]; + + int *neighs = &neighbor->neighbors[sci * neighbor->maxneighs]; + int numneighs = neighbor->numneigh[sci]; + int k = 0; + + // Remove dummy clusters if necessary + if(CLUSTER_N < VECTOR_WIDTH) { + while(neighs[numneighs - 1] == atom->dummy_cj) { + numneighs--; + } + } + + while(k < numneighs) { + int cj = neighs[k]; + if(atomDistanceInRange(atom, ci, cj, cutsq)) { + k++; + } else { + numneighs--; + neighs[k] = neighs[numneighs]; + } + } + + // Readd dummy clusters if necessary + if(CLUSTER_N < VECTOR_WIDTH) { + while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) { + neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster + } + } + + neighbor->numneigh[sci] = numneighs; + } + } + + DEBUG_MESSAGE("pruneNeighbor end\n"); +} +#endif //USE_SUPER_CLUSTERS + /* internal subroutines */ MD_FLOAT bindist(int i, int j) { MD_FLOAT delx, dely, delz; @@ -529,6 +772,36 @@ void sortAtomsByZCoord(Atom *atom) { DEBUG_MESSAGE("sortAtomsByZCoord end\n"); } +#ifdef USE_SUPER_CLUSTERS +// TODO: Use pigeonhole sorting +void sortAtomsByCoord(Atom *atom, int dim, int bin, int start_index, int end_index) { + //DEBUG_MESSAGE("sortAtomsByCoord start\n"); + int *bin_ptr = &bins[bin * atoms_per_bin]; + + for(int ac_i = start_index; ac_i <= end_index; ac_i++) { + int i = bin_ptr[ac_i]; + int min_ac = ac_i; + int min_idx = i; + MD_FLOAT min_coord = DIM_COORD(dim, i); + + for(int ac_j = ac_i + 1; ac_j <= end_index; ac_j++) { + int j = bin_ptr[ac_j]; + MD_FLOAT coordj = DIM_COORD(dim, j); + if(coordj < min_coord) { + min_ac = ac_j; + min_idx = j; + min_coord = coordj; + } + } + + bin_ptr[ac_i] = min_idx; + bin_ptr[min_ac] = i; + } + + //DEBUG_MESSAGE("sortAtomsByCoord end\n"); +} +#endif //USE_SUPER_CLUSTERS + void buildClusters(Atom *atom) { DEBUG_MESSAGE("buildClusters start\n"); atom->Nclusters_local = 0; @@ -605,6 +878,172 @@ void buildClusters(Atom *atom) { DEBUG_MESSAGE("buildClusters end\n"); } +#ifdef USE_SUPER_CLUSTERS +void buildClustersGPU(Atom *atom) { + DEBUG_MESSAGE("buildClustersGPU start\n"); + atom->Nclusters_local = 0; + + /* bin local atoms */ + binAtoms(atom); + + for(int bin = 0; bin < mbins; bin++) { + int c = bincount[bin]; + sortAtomsByCoord(atom, ZZ, bin, 0, c - 1); + int ac = 0; + int nclusters = ((c + CLUSTER_M - 1) / CLUSTER_M); + if(CLUSTER_N > CLUSTER_M && nclusters % 2) { nclusters++; } + + int n_super_clusters_xy = nclusters / (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y); + if (nclusters % (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y)) n_super_clusters_xy++; + int n_super_clusters = n_super_clusters_xy / SCLUSTER_SIZE_Z; + if (n_super_clusters_xy % SCLUSTER_SIZE_Z) n_super_clusters++; + + //printf("%d\t%d\t%d\t%d\r\n", nclusters, n_super_clusters_xy, n_super_clusters, (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z)*n_super_clusters); + + int cl_count = 0; + + for (int scl = 0; scl < n_super_clusters; scl++) { + const int sci = atom->Nsclusters_local; + if(sci >= atom->Nsclusters_max) { + growSuperClusters(atom); + } + + if (cl_count >= nclusters) break; + + int scl_offset = scl * SCLUSTER_SIZE * CLUSTER_M; + + MD_FLOAT sc_bbminx = INFINITY, sc_bbmaxx = -INFINITY; + MD_FLOAT sc_bbminy = INFINITY, sc_bbmaxy = -INFINITY; + MD_FLOAT sc_bbminz = INFINITY, sc_bbmaxz = -INFINITY; + + for (int scl_z = 0; scl_z < SCLUSTER_SIZE_Z; scl_z++) { + + if (cl_count >= nclusters) break; + + const int atom_scl_z_offset = scl_offset + scl_z * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X * CLUSTER_M; + + + const int atom_scl_z_end_idx = MIN(atom_scl_z_offset + + SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X * CLUSTER_M - 1, c - 1); + + sortAtomsByCoord(atom, YY, bin, atom_scl_z_offset, atom_scl_z_end_idx); + + //printf("%d\t%d\t%d\t%d\t%d\t%d\r\n", nclusters, scl, scl_z, atom_scl_z_offset, atom_scl_z_end_idx, c); + + for (int scl_y = 0; scl_y < SCLUSTER_SIZE_Y; scl_y++) { + + if (cl_count >= nclusters) break; + + const int atom_scl_y_offset = scl_offset + + scl_z * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X * CLUSTER_M + + scl_y * SCLUSTER_SIZE_Y * CLUSTER_M; + + const int atom_scl_y_end_idx = MIN(atom_scl_y_offset + + SCLUSTER_SIZE_X * CLUSTER_M - 1, c - 1); + + //printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\r\n", nclusters, scl, scl_z, scl_y, atom_scl_y_offset, atom_scl_y_end_idx, c); + + sortAtomsByCoord(atom, XX, bin, atom_scl_y_offset, atom_scl_y_end_idx); + + + for (int scl_x = 0; scl_x < SCLUSTER_SIZE_X; scl_x++) { + if (cl_count >= nclusters) break; + cl_count++; + + const int cluster_sup_idx = scl_z * SCLUSTER_SIZE_Z * SCLUSTER_SIZE_Y + + scl_y * SCLUSTER_SIZE_X + scl_x; + + const int ci = atom->Nclusters_local; + if(ci >= atom->Nclusters_max) { + growClusters(atom); + } + + int ci_sca_base = CI_SCALAR_BASE_INDEX(ci); + int ci_vec_base = CI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base]; + MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base]; + int *ci_type = &atom->cl_type[ci_sca_base]; + MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY; + MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY; + MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY; + + atom->iclusters[ci].natoms = 0; + for(int cii = 0; cii < CLUSTER_M; cii++) { + if(ac < c) { + int i = bins[bin * atoms_per_bin + ac]; + MD_FLOAT xtmp = atom_x(i); + MD_FLOAT ytmp = atom_y(i); + MD_FLOAT ztmp = atom_z(i); + + ci_x[CL_X_OFFSET + cii] = xtmp; + ci_x[CL_Y_OFFSET + cii] = ytmp; + ci_x[CL_Z_OFFSET + cii] = ztmp; + ci_v[CL_X_OFFSET + cii] = atom->vx[i]; + ci_v[CL_Y_OFFSET + cii] = atom->vy[i]; + ci_v[CL_Z_OFFSET + cii] = atom->vz[i]; + + // TODO: To create the bounding boxes faster, we can use SIMD operations + if(bbminx > xtmp) { bbminx = xtmp; } + if(bbmaxx < xtmp) { bbmaxx = xtmp; } + if(bbminy > ytmp) { bbminy = ytmp; } + if(bbmaxy < ytmp) { bbmaxy = ytmp; } + if(bbminz > ztmp) { bbminz = ztmp; } + if(bbmaxz < ztmp) { bbmaxz = ztmp; } + + ci_type[cii] = atom->type[i]; + atom->iclusters[ci].natoms++; + } else { + ci_x[CL_X_OFFSET + cii] = INFINITY; + ci_x[CL_Y_OFFSET + cii] = INFINITY; + ci_x[CL_Z_OFFSET + cii] = INFINITY; + } + + ac++; + } + + atom->icluster_bin[ci] = bin; + atom->iclusters[ci].bbminx = bbminx; + atom->iclusters[ci].bbmaxx = bbmaxx; + atom->iclusters[ci].bbminy = bbminy; + atom->iclusters[ci].bbmaxy = bbmaxy; + atom->iclusters[ci].bbminz = bbminz; + atom->iclusters[ci].bbmaxz = bbmaxz; + atom->Nclusters_local++; + + // TODO: To create the bounding boxes faster, we can use SIMD operations + if(sc_bbminx > bbminx) { sc_bbminx = bbminx; } + if(sc_bbmaxx < bbmaxx) { sc_bbmaxx = bbmaxx; } + if(sc_bbminy > bbminy) { sc_bbminy = bbminy; } + if(sc_bbmaxy < bbmaxy) { sc_bbmaxy = bbmaxy; } + if(sc_bbminz > bbminz) { sc_bbminz = bbminz; } + if(sc_bbmaxz < bbmaxz) { sc_bbmaxz = bbmaxz; } + + atom->siclusters[sci].nclusters++; + atom->icluster_idx[SCLUSTER_SIZE * sci + cluster_sup_idx] = ci; + //atom->siclusters[sci].iclusters[cluster_sup_idx] = ci; + + } + } + } + + atom->sicluster_bin[sci] = bin; + atom->siclusters[sci].bbminx = sc_bbminx; + atom->siclusters[sci].bbmaxx = sc_bbmaxx; + atom->siclusters[sci].bbminy = sc_bbminy; + atom->siclusters[sci].bbmaxy = sc_bbmaxy; + atom->siclusters[sci].bbminz = sc_bbminz; + atom->siclusters[sci].bbmaxz = sc_bbmaxz; + atom->Nsclusters_local++; + } + + //printf("%d\t%d\r\n", (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z)*n_super_clusters, count); + + } + + DEBUG_MESSAGE("buildClustersGPU end\n"); +} +#endif //USE_SUPER_CLUSTERS + void defineJClusters(Atom *atom) { DEBUG_MESSAGE("defineJClusters start\n"); diff --git a/gromacs/utils.c b/gromacs/utils.c new file mode 100644 index 0000000..1d9db02 --- /dev/null +++ b/gromacs/utils.c @@ -0,0 +1,277 @@ + +/* + * Temporal functions for debugging, remove before proceeding with pull request + */ + +#include + +#include + +extern void alignDataToSuperclusters(Atom *atom); +extern void alignDataFromSuperclusters(Atom *atom); + +#ifdef USE_SUPER_CLUSTERS +/* +void verifyClusters(Atom *atom) { + unsigned int count = 0; + + for (int i = 0; i < atom->Nsclusters_local; i++) { + for (int j = 0; j < atom->siclusters[i].nclusters; j++) { + for(int cii = 0; cii < CLUSTER_M; cii++, count++); + } + } + + MD_FLOAT *x = malloc(count * sizeof(MD_FLOAT)); + MD_FLOAT *y = malloc(count * sizeof(MD_FLOAT)); + MD_FLOAT *z = malloc(count * sizeof(MD_FLOAT)); + + count = 0; + unsigned int diffs = 0; + + printf("######### %d #########\r\n", atom->Nsclusters_local); + for (int i = 0; i < atom->Nsclusters_local; i++) { + printf("######### %d\t #########\r\n", atom->siclusters[i].nclusters); + + for (int j = 0; j < atom->siclusters[i].nclusters; j++) { + //printf("%d\t", atom.siclusters[i].iclusters[j]); + MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[i].iclusters[j])]; + + if (atom->iclusters[atom->siclusters[i].iclusters[j]].bbminx < atom->siclusters[i].bbminx || + atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxx > atom->siclusters[i].bbmaxx || + atom->iclusters[atom->siclusters[i].iclusters[j]].bbminy < atom->siclusters[i].bbminy || + atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxy > atom->siclusters[i].bbmaxy || + atom->iclusters[atom->siclusters[i].iclusters[j]].bbminz < atom->siclusters[i].bbminz || + atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxz > atom->siclusters[i].bbmaxz) diffs++; + + + for(int cii = 0; cii < CLUSTER_M; cii++, count++) { + x[count] = ci_x[CL_X_OFFSET + cii]; + y[count] = ci_x[CL_Y_OFFSET + cii]; + z[count] = ci_x[CL_Z_OFFSET + cii]; + //printf("x: %f\ty: %f\tz: %f\r\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); + } + } + printf("######### \t #########\r\n"); + } + + printf("######### Diffs: %d\t #########\r\n", diffs); + + printf("\r\n"); + + count = 0; + diffs = 0; + + for (int i = 0; i < atom->Nclusters_local; i++) { + MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(i)]; + + for(int cii = 0; cii < CLUSTER_M; cii++, count++) { + if (ci_x[CL_X_OFFSET + cii] != x[count] || + ci_x[CL_Y_OFFSET + cii] != y[count] || + ci_x[CL_Z_OFFSET + cii] != z[count]) diffs++; + } + } + + printf("######### Diffs: %d\t #########\r\n", diffs); +} + */ + +void verifyLayout(Atom *atom) { + + printf("verifyLayout start\r\n"); + + /* + unsigned int count = 0; + + for (int i = 0; i < atom->Nsclusters_local; i++) { + for (int j = 0; j < atom->siclusters[i].nclusters; j++, count++); + } + + MD_FLOAT *scl_x = malloc(atom->Nsclusters_local * SCLUSTER_SIZE * 3 * CLUSTER_M * sizeof(MD_FLOAT)); + + + for (int sci = 0; sci < atom->Nsclusters_local; sci++) { + const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M; + + for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) { + MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])]; + + const unsigned int atom_offset = scci; + + /* + for(int cii = 0, scii = atom_offset; cii < CLUSTER_M; cii++, scii += 3) { + scl_x[CL_X_OFFSET + scii] = ci_x[CL_X_OFFSET + cii]; + scl_x[CL_Y_OFFSET + scii] = ci_x[CL_Y_OFFSET + cii]; + scl_x[CL_Z_OFFSET + scii] = ci_x[CL_Z_OFFSET + cii]; + //printf("x: %f\ty: %f\tz: %f\r\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]); + } + + + memcpy(&scl_x[atom_offset], &ci_x[0], CLUSTER_M * sizeof(MD_FLOAT)); + memcpy(&scl_x[atom_offset + SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT)); + memcpy(&scl_x[atom_offset + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT)); + + } + } + + */ + //alignDataToSuperclusters(atom); + + //for (int sci = 0; sci < 2; sci++) { + for (int sci = 4; sci < 6; sci++) { + const unsigned int scl_offset = sci * SCLUSTER_SIZE; + + MD_FLOAT *sci_x = &atom->scl_f[SCI_VECTOR_BASE_INDEX(sci)]; + + for (int cii = 0; cii < SCLUSTER_M; ++cii) { + + const unsigned int cl_idx = cii / CLUSTER_M; + const unsigned int ciii = cii % CLUSTER_M; + + /* + printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[cii], + sci_x[cii + SCLUSTER_SIZE * CLUSTER_M], sci_x[cii + 2 * SCLUSTER_SIZE * CLUSTER_M]); + */ + + printf("%d\t%d\t%f\t%f\t%f\r\n", atom->icluster_idx[SCLUSTER_SIZE * sci + cl_idx], cl_idx, sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii], + sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]); + } + + + + /* + //for (int cii = 0; cii < SCLUSTER_M; ++cii) { + for (int cii = 0; cii < SCLUSTER_M; ++cii) { + + const unsigned int cl_idx = cii / CLUSTER_M; + const unsigned int ciii = cii % CLUSTER_M; + + /* + printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_X_OFFSET(cl_idx) + cii], + sci_x[SCL_Y_OFFSET(cl_idx) + cii], sci_x[SCL_Z_OFFSET(cl_idx) + cii]); + */ + + /* + printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_X_OFFSET(cl_idx) + ciii], + sci_x[SCL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_Z_OFFSET(cl_idx) + ciii]); + } + */ + + + + + + + + + + + /* + for (int scii = scl_offset; scii < scl_offset + SCLUSTER_SIZE; scii++) { + + for (int cii = 0; cii < CLUSTER_M; ++cii) { + printf("%f\t%f\t%f\r\n", sci_x[SCL_X_OFFSET(scii) + cii], + sci_x[SCL_Y_OFFSET(scii) + cii], sci_x[SCL_Z_OFFSET(scii) + cii]); + } + /* + + const unsigned int cl_offset = scii * 3 * CLUSTER_M; + //MD_FLOAT *sci_x = &scl_x[CI_VECTOR_BASE_INDEX(scii)]; + + for (int cii = cl_offset; cii < cl_offset + CLUSTER_M; ++cii) { + printf("%f\t%f\t%f\r\n", sci_x[CL_X_OFFSET + cii], + sci_x[CL_Y_OFFSET + cii], sci_x[CL_Z_OFFSET + cii]); + } + */ + + + /* + for (int cii = cl_offset; cii < cl_offset + CLUSTER_M; ++cii) { + printf("%f\t%f\t%f\r\n", scl_x[CL_X_OFFSET + cii], + scl_x[CL_Y_OFFSET + cii], scl_x[CL_Z_OFFSET + cii]); + } + */ + + //} + + printf("##########\t##########\r\n"); + } + + printf("\r\n"); + + //for (int ci = 0; ci < 16; ci++) { + for (int ci = 35; ci < 37; ci++) { + printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", ci, atom->icluster_bin[ci]); + MD_FLOAT *ci_x = &atom->cl_f[CI_VECTOR_BASE_INDEX(ci)]; + + //for(int cii = 0; cii < CLUSTER_M; cii++, count++) { + for(int cii = 0; cii < CLUSTER_M; cii++) { + + printf("%f\t%f\t%f\r\n", ci_x[CL_X_OFFSET + cii], + ci_x[CL_Y_OFFSET + cii], + ci_x[CL_Z_OFFSET + cii]); + } + printf("##########\t##########\r\n"); + } + + printf("verifyLayout end\r\n"); + + /* + for (int i = 0; i < atom->Nclusters_local; i++) { + MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(i)]; + + for(int cii = 0; cii < CLUSTER_M; cii++, count++) { + if (ci_x[CL_X_OFFSET + cii] != x[count] || + ci_x[CL_Y_OFFSET + cii] != y[count] || + ci_x[CL_Z_OFFSET + cii] != z[count]) diffs++; + } + } + */ +} + +void checkAlignment(Atom *atom) { + alignDataToSuperclusters(atom); + + for (int sci = 4; sci < 6; sci++) { + MD_FLOAT *sci_x = &atom->scl_x[SCI_VECTOR_BASE_INDEX(sci)]; + + for (int cii = 0; cii < SCLUSTER_M; ++cii) { + + const unsigned int cl_idx = cii / CLUSTER_M; + const unsigned int ciii = cii % CLUSTER_M; + + printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii], + sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]); + } + + } + + for (int ci = 35; ci < 37; ci++) { + printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", ci, atom->icluster_bin[ci]); + MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)]; + + for(int cii = 0; cii < CLUSTER_M; cii++) { + + printf("%f\t%f\t%f\r\n", ci_x[CL_X_OFFSET + cii], + ci_x[CL_Y_OFFSET + cii], + ci_x[CL_Z_OFFSET + cii]); + } + printf("##########\t##########\r\n"); + } +} + +void showSuperclusters(Atom *atom) { + for (int sci = 4; sci < 6; sci++) { + MD_FLOAT *sci_x = &atom->scl_x[SCI_VECTOR_BASE_INDEX(sci)]; + + for (int cii = 0; cii < SCLUSTER_M; ++cii) { + + const unsigned int cl_idx = cii / CLUSTER_M; + const unsigned int ciii = cii % CLUSTER_M; + + printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii], + sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]); + } + + } +} +#endif //USE_SUPER_CLUSTERS diff --git a/gromacs/vtk.c b/gromacs/vtk.c index b683dc8..5bb91cc 100644 --- a/gromacs/vtk.c +++ b/gromacs/vtk.c @@ -15,8 +15,61 @@ void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) { write_ghost_atoms_to_vtk_file(filename, atom, timestep); write_local_cluster_edges_to_vtk_file(filename, atom, timestep); write_ghost_cluster_edges_to_vtk_file(filename, atom, timestep); +#ifdef USE_SUPER_CLUSTERS + write_super_clusters_to_vtk_file(filename, atom, timestep); +#endif //#ifdef USE_SUPER_CLUSTERS } +#ifdef USE_SUPER_CLUSTERS +int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep) { + char timestep_filename[128]; + snprintf(timestep_filename, sizeof timestep_filename, "%s_sup_%d.vtk", filename, timestep); + FILE* fp = fopen(timestep_filename, "wb"); + + if(fp == NULL) { + fprintf(stderr, "Could not open VTK file for writing!\n"); + return -1; + } + + fprintf(fp, "# vtk DataFile Version 2.0\n"); + fprintf(fp, "Particle data\n"); + fprintf(fp, "ASCII\n"); + fprintf(fp, "DATASET UNSTRUCTURED_GRID\n"); + fprintf(fp, "POINTS %d double\n", atom->Nsclusters_local * SCLUSTER_M); + for(int ci = 0; ci < atom->Nsclusters_local; ++ci) { + + int factor = (rand() % 1000) + 1; + //double factor = ci * 10; + + int ci_vec_base = SCI_VECTOR_BASE_INDEX(ci); + MD_FLOAT *ci_x = &atom->scl_x[ci_vec_base]; + for(int cii = 0; cii < SCLUSTER_M; ++cii) { + fprintf(fp, "%.4f %.4f %.4f\n", ci_x[SCL_X_OFFSET + cii] * factor, ci_x[SCL_Y_OFFSET + cii] * factor, ci_x[SCL_Z_OFFSET + cii] * factor); + } + } + fprintf(fp, "\n\n"); + fprintf(fp, "CELLS %d %d\n", atom->Nlocal, atom->Nlocal * 2); + for(int i = 0; i < atom->Nlocal; ++i) { + fprintf(fp, "1 %d\n", i); + } + fprintf(fp, "\n\n"); + fprintf(fp, "CELL_TYPES %d\n", atom->Nlocal); + for(int i = 0; i < atom->Nlocal; ++i) { + fprintf(fp, "1\n"); + } + fprintf(fp, "\n\n"); + fprintf(fp, "POINT_DATA %d\n", atom->Nlocal); + fprintf(fp, "SCALARS mass double\n"); + fprintf(fp, "LOOKUP_TABLE default\n"); + for(int i = 0; i < atom->Nlocal; i++) { + fprintf(fp, "1.0\n"); + } + fprintf(fp, "\n\n"); + fclose(fp); + return 0; +} +#endif //USE_SUPER_CLUSTERS + int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) { char timestep_filename[128]; snprintf(timestep_filename, sizeof timestep_filename, "%s_local_%d.vtk", filename, timestep); diff --git a/include_NVCC.mk b/include_NVCC.mk index de7a370..7e412c1 100644 --- a/include_NVCC.mk +++ b/include_NVCC.mk @@ -8,7 +8,8 @@ ANSI_CFLAGS += -Wextra # # A100 + Native -CFLAGS = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp +#CFLAGS = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp +CFLAGS = -O3 -arch=compute_61 -code=sm_61,sm_80,sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp # A40 + Native #CFLAGS = -O3 -arch=sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp # Cascade Lake