Building of super clusters complete, force computation kernel WIP

Andropov Arsenii 2022-12-18 14:28:29 +01:00
parent b20e8c6986
commit ee3f6de050
14 changed files with 1326 additions and 4 deletions

View File

@@ -97,6 +97,10 @@ ifeq ($(strip $(USE_SIMD_KERNEL)),true)
DEFINES += -DUSE_SIMD_KERNEL
endif
ifeq ($(strip $(USE_SUPER_CLUSTERS)),true)
DEFINES += -DUSE_SUPER_CLUSTERS
endif
VPATH = $(SRC_DIR) $(ASM_DIR) $(CUDA_DIR)
ASM = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s,$(wildcard $(SRC_DIR)/*.c))
OVERWRITE:= $(patsubst $(ASM_DIR)/%-new.s, $(BUILD_DIR)/%.o,$(wildcard $(ASM_DIR)/*-new.s))

View File

@@ -1,5 +1,5 @@
# Compiler tag (GCC/CLANG/ICC/ICX/ONEAPI/NVCC)
TAG ?= NVCC
# Instruction set (SSE/AVX/AVX_FMA/AVX2/AVX512)
ISA ?= AVX512
# Optimization scheme (lammps/gromacs/clusters_per_bin)
@@ -41,6 +41,7 @@ HALF_NEIGHBOR_LISTS_CHECK_CJ ?= false
# Configurations for CUDA
# Use CUDA host memory to optimize transfers
USE_CUDA_HOST_MEMORY ?= false
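# Group i-clusters into super clusters for the CUDA kernels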
USE_SUPER_CLUSTERS ?= true
#Feature options
OPTIONS = -DALIGNMENT=64

View File

@@ -37,6 +37,24 @@ void initAtom(Atom *atom) {
atom->iclusters = NULL;
atom->jclusters = NULL;
atom->icluster_bin = NULL;
#ifdef USE_SUPER_CLUSTERS
atom->scl_x = NULL;
atom->scl_v = NULL;
atom->scl_f = NULL;
atom->Nsclusters = 0;
atom->Nsclusters_local = 0;
atom->Nsclusters_ghost = 0;
atom->Nsclusters_max = 0;
atom->scl_type = NULL;
atom->siclusters = NULL;
atom->icluster_idx = NULL;
atom->sicluster_bin = NULL;
#endif //USE_SUPER_CLUSTERS
}
void createAtom(Atom *atom, Parameter *param) {
@@ -421,3 +439,25 @@ void growClusters(Atom *atom) {
atom->cl_v = (MD_FLOAT*) reallocate(atom->cl_v, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT), nold * CLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->cl_type = (int*) reallocate(atom->cl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * sizeof(int), nold * CLUSTER_M * sizeof(int));
}
#ifdef USE_SUPER_CLUSTERS
void growSuperClusters(Atom *atom) {
int nold = atom->Nsclusters_max;
atom->Nsclusters_max += DELTA;
atom->siclusters = (SuperCluster*) reallocate(atom->siclusters, ALIGNMENT, atom->Nsclusters_max * sizeof(SuperCluster), nold * sizeof(SuperCluster));
atom->icluster_idx = (int*) reallocate(atom->icluster_idx, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int), nold * SCLUSTER_SIZE * sizeof(int));
atom->sicluster_bin = (int*) reallocate(atom->sicluster_bin, ALIGNMENT, atom->Nsclusters_max * sizeof(int), nold * sizeof(int));
//atom->scl_type = (int*) reallocate(atom->scl_type, ALIGNMENT, atom->Nclusters_max * CLUSTER_M * SCLUSTER_SIZE * sizeof(int), nold * CLUSTER_M * SCLUSTER_SIZE * sizeof(int));
atom->scl_x = (MD_FLOAT*) reallocate(atom->scl_x, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->scl_f = (MD_FLOAT*) reallocate(atom->scl_f, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
atom->scl_v = (MD_FLOAT*) reallocate(atom->scl_v, ALIGNMENT, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT), nold * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
/*
for (int sci = 0; sci < atom->Nsclusters_max; sci++) {
atom->siclusters[sci].iclusters = (int*) reallocate(atom->siclusters[sci].iclusters, ALIGNMENT, SCLUSTER_SIZE * sizeof(int), SCLUSTER_SIZE * sizeof(int));
}
*/
}
#endif //USE_SUPER_CLUSTERS

View File

@@ -39,8 +39,29 @@ extern "C" {
MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
int isReneighboured;
int *cuda_iclusters;
int *cuda_nclusters;
int cuda_max_scl;
MD_FLOAT *cuda_scl_x;
MD_FLOAT *cuda_scl_v;
MD_FLOAT *cuda_scl_f;
extern void alignDataToSuperclusters(Atom *atom);
extern void alignDataFromSuperclusters(Atom *atom);
extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
}
extern __global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters,
int *cuda_natoms,
int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt);
extern __global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters, int *cuda_natoms,
int Nsclusters_local, MD_FLOAT dtforce);
extern "C" extern "C"
void initDevice(Atom *atom, Neighbor *neighbor) { void initDevice(Atom *atom, Neighbor *neighbor) {
cuda_assert("cudaDeviceSetup", cudaDeviceReset()); cuda_assert("cudaDeviceSetup", cudaDeviceReset());
@@ -59,10 +80,23 @@ void initDevice(Atom *atom, Neighbor *neighbor) {
natoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
ngatoms = (int *) malloc(atom->Nclusters_max * sizeof(int));
isReneighboured = 1;
#ifdef USE_SUPER_CLUSTERS
cuda_max_scl = atom->Nsclusters_max;
cuda_iclusters = (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
cuda_nclusters = (int *) allocateGPU(atom->Nsclusters_max * sizeof(int));
cuda_scl_x = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_v = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_f = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
#endif //USE_SUPER_CLUSTERS
}
extern "C"
void copyDataToCUDADevice(Atom *atom) {
DEBUG_MESSAGE("copyDataToCUDADevice start\r\n");
memcpyToGPU(cuda_cl_x, atom->cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_cl_v, atom->cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_cl_f, atom->cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
@@ -85,13 +119,49 @@ void copyDataToCUDADevice(Atom *atom) {
memcpyToGPU(cuda_PBCx, atom->PBCx, atom->Nclusters_ghost * sizeof(int));
memcpyToGPU(cuda_PBCy, atom->PBCy, atom->Nclusters_ghost * sizeof(int));
memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int));
#ifdef USE_SUPER_CLUSTERS
alignDataToSuperclusters(atom);
if (cuda_max_scl < atom->Nsclusters_max) {
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f));
cuda_max_scl = atom->Nsclusters_max;
cuda_iclusters = (int *) allocateGPU(atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
cuda_nclusters = (int *) allocateGPU(atom->Nsclusters_max * sizeof(int));
cuda_scl_x = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_v = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
cuda_scl_f = (MD_FLOAT *) allocateGPU(atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
}
memcpyToGPU(cuda_scl_x, atom->scl_x, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_scl_v, atom->scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyToGPU(cuda_scl_f, atom->scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
#endif //USE_SUPER_CLUSTERS
DEBUG_MESSAGE("copyDataToCUDADevice stop\r\n");
}
extern "C"
void copyDataFromCUDADevice(Atom *atom) {
DEBUG_MESSAGE("copyDataFromCUDADevice start\r\n");
memcpyFromGPU(atom->cl_x, cuda_cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); memcpyFromGPU(atom->cl_x, cuda_cl_x, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->cl_v, cuda_cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); memcpyFromGPU(atom->cl_v, cuda_cl_v, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->cl_f, cuda_cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT)); memcpyFromGPU(atom->cl_f, cuda_cl_f, atom->Nclusters_max * CLUSTER_M * 3 * sizeof(MD_FLOAT));
#ifdef USE_SUPER_CLUSTERS
alignDataFromSuperclusters(atom);
memcpyFromGPU(atom->scl_x, cuda_scl_x, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->scl_v, cuda_scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
memcpyFromGPU(atom->scl_f, cuda_scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
#endif //USE_SUPER_CLUSTERS
DEBUG_MESSAGE("copyDataFromCUDADevice stop\r\n");
}
extern "C"
@@ -109,6 +179,12 @@ void cudaDeviceFree() {
cuda_assert("cudaDeviceFree", cudaFree(cuda_PBCz));
free(natoms);
free(ngatoms);
#ifdef USE_SUPER_CLUSTERS
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_v));
cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_f));
#endif //USE_SUPER_CLUSTERS
}
__global__ void cudaInitialIntegrate_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
@@ -251,9 +327,17 @@ extern "C"
void cudaInitialIntegrate(Parameter *param, Atom *atom) {
const int threads_num = 16;
dim3 block_size = dim3(threads_num, 1, 1);
#ifdef USE_SUPER_CLUSTERS
dim3 grid_size = dim3(atom->Nsclusters_local/(threads_num)+1, 1, 1);
cudaInitialIntegrateSup_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_scl_v, cuda_scl_f,
cuda_nclusters,
cuda_natoms, atom->Nsclusters_local, param->dtforce, param->dt);
#else
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
cudaInitialIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_x, cuda_cl_v, cuda_cl_f,
cuda_natoms, atom->Nclusters_local, param->dtforce, param->dt);
#endif //USE_SUPER_CLUSTERS
cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError()); cuda_assert("cudaInitialIntegrate", cudaPeekAtLastError());
cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize()); cuda_assert("cudaInitialIntegrate", cudaDeviceSynchronize());
} }
@@ -310,8 +394,17 @@ extern "C"
void cudaFinalIntegrate(Parameter *param, Atom *atom) {
const int threads_num = 16;
dim3 block_size = dim3(threads_num, 1, 1);
#ifdef USE_SUPER_CLUSTERS
dim3 grid_size = dim3(atom->Nsclusters_local/(threads_num)+1, 1, 1);
cudaFinalIntegrateSup_warp<<<grid_size, block_size>>>(cuda_scl_v, cuda_scl_f,
cuda_nclusters, cuda_natoms,
atom->Nsclusters_local, param->dt);
#else
dim3 grid_size = dim3(atom->Nclusters_local/(threads_num)+1, 1, 1);
cudaFinalIntegrate_warp<<<grid_size, block_size>>>(cuda_cl_v, cuda_cl_f, cuda_natoms,
atom->Nclusters_local, param->dt);
#endif //USE_SUPER_CLUSTERS
cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError()); cuda_assert("cudaFinalIntegrate", cudaPeekAtLastError());
cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize()); cuda_assert("cudaFinalIntegrate", cudaDeviceSynchronize());
} }

View File

@@ -0,0 +1,290 @@
extern "C" {
#include <stdio.h>
#include <string.h> // memcpy() in the data alignment helpers below
//---
#include <cuda.h>
#include <driver_types.h>
//---
#include <likwid-marker.h>
//---
#include <atom.h>
#include <device.h>
#include <neighbor.h>
#include <parameter.h>
#include <stats.h>
#include <timing.h>
#include <util.h>
}
extern "C" {
extern MD_FLOAT *cuda_cl_x;
extern MD_FLOAT *cuda_cl_v;
extern MD_FLOAT *cuda_cl_f;
extern int *cuda_neighbors;
extern int *cuda_numneigh;
extern int *cuda_natoms;
extern int *natoms;
extern int *ngatoms;
extern int *cuda_border_map;
extern int *cuda_jclusters_natoms;
extern MD_FLOAT *cuda_bbminx, *cuda_bbmaxx;
extern MD_FLOAT *cuda_bbminy, *cuda_bbmaxy;
extern MD_FLOAT *cuda_bbminz, *cuda_bbmaxz;
extern int *cuda_PBCx, *cuda_PBCy, *cuda_PBCz;
extern int isReneighboured;
extern int *cuda_iclusters;
extern int *cuda_nclusters;
extern MD_FLOAT *cuda_scl_x;
extern MD_FLOAT *cuda_scl_v;
extern MD_FLOAT *cuda_scl_f;
}
#ifdef USE_SUPER_CLUSTERS
extern "C"
void alignDataToSuperclusters(Atom *atom) {
for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
/*
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
*/
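// Copy the x/y/z rows of this i-cluster into the super-cluster buffer: per dimension,
// the CLUSTER_M-wide rows of all SCLUSTER_SIZE member clusters lie contiguously, and
// the dimensions are SCLUSTER_SIZE * CLUSTER_M values apart.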
memcpy(&atom->scl_x[scci], &ci_x[0], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_v[scci], &ci_v[0], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_v[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_f[scci], &ci_f[0], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_f[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
}
}
}
extern "C"
void alignDataFromSuperclusters(Atom *atom) {
for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->icluster_idx[SCLUSTER_SIZE * sci + ci])];
/*
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
MD_FLOAT *ci_v = &atom->cl_v[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
MD_FLOAT *ci_f = &atom->cl_f[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
*/
memcpy(&ci_x[0], &atom->scl_x[scci], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_x[0 + CLUSTER_M], &atom->scl_x[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_x[0 + 2 * CLUSTER_M], &atom->scl_x[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_v[0], &atom->scl_v[scci], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_v[0 + CLUSTER_M], &atom->scl_v[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_v[0 + 2 * CLUSTER_M], &atom->scl_v[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_f[0], &atom->scl_f[scci], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_f[0 + CLUSTER_M], &atom->scl_f[scci + SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&ci_f[0 + 2 * CLUSTER_M], &atom->scl_f[scci + 2 * SCLUSTER_SIZE * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
}
}
}
__global__ void cudaInitialIntegrateSup_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters,
int *cuda_natoms,
int Nsclusters_local, MD_FLOAT dtforce, MD_FLOAT dt) {
unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
//unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
if (sci_pos >= Nsclusters_local) return;
//unsigned int ci_pos = cii_pos / CLUSTER_M;
//unsigned int scii_pos = cii_pos % CLUSTER_M;
//if (ci_pos >= cuda_nclusters[sci_pos]) return;
//if (scii_pos >= cuda_natoms[ci_pos]) return;
int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
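// One thread integrates all SCLUSTER_M atom slots of its super cluster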
for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) {
ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos];
ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos];
ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos];
ci_x[SCL_X_OFFSET + scii_pos] += dt * ci_v[SCL_X_OFFSET + scii_pos];
ci_x[SCL_Y_OFFSET + scii_pos] += dt * ci_v[SCL_Y_OFFSET + scii_pos];
ci_x[SCL_Z_OFFSET + scii_pos] += dt * ci_v[SCL_Z_OFFSET + scii_pos];
}
}
__global__ void cudaFinalIntegrateSup_warp(MD_FLOAT *cuda_cl_v, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters, int *cuda_natoms,
int Nsclusters_local, MD_FLOAT dtforce) {
unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
//unsigned int cii_pos = blockDim.y * blockIdx.y + threadIdx.y;
if (sci_pos >= Nsclusters_local) return;
//unsigned int ci_pos = cii_pos / CLUSTER_M;
//unsigned int scii_pos = cii_pos % CLUSTER_M;
//if (ci_pos >= cuda_nclusters[sci_pos]) return;
//if (scii_pos >= cuda_natoms[ci_pos]) return;
int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
MD_FLOAT *ci_v = &cuda_cl_v[ci_vec_base];
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
for (int scii_pos = 0; scii_pos < SCLUSTER_M; scii_pos++) {
ci_v[SCL_X_OFFSET + scii_pos] += dtforce * ci_f[SCL_X_OFFSET + scii_pos];
ci_v[SCL_Y_OFFSET + scii_pos] += dtforce * ci_f[SCL_Y_OFFSET + scii_pos];
ci_v[SCL_Z_OFFSET + scii_pos] += dtforce * ci_f[SCL_Z_OFFSET + scii_pos];
}
}
__global__ void computeForceLJSup_cuda_warp(MD_FLOAT *cuda_cl_x, MD_FLOAT *cuda_cl_f,
int *cuda_nclusters, int *cuda_iclusters,
int Nsclusters_local,
int *cuda_numneigh, int *cuda_neighs, int half_neigh, int maxneighs,
MD_FLOAT cutforcesq, MD_FLOAT sigma6, MD_FLOAT epsilon) {
unsigned int sci_pos = blockDim.x * blockIdx.x + threadIdx.x;
unsigned int scii_pos = blockDim.y * blockIdx.y + threadIdx.y;
unsigned int cjj_pos = blockDim.z * blockIdx.z + threadIdx.z;
if ((sci_pos >= Nsclusters_local) || (scii_pos >= SCLUSTER_M) || (cjj_pos >= CLUSTER_N)) return;
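// Split the super-cluster atom slot into member cluster and lane within it
// (SCLUSTER_M = SCLUSTER_SIZE * CLUSTER_M)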
unsigned int ci_pos = scii_pos / CLUSTER_M;
unsigned int cii_pos = scii_pos % CLUSTER_M;
if (ci_pos >= cuda_nclusters[sci_pos]) return;
int ci_cj0 = CJ0_FROM_CI(ci_pos);
int ci_vec_base = SCI_VECTOR_BASE_INDEX(sci_pos);
MD_FLOAT *ci_x = &cuda_cl_x[ci_vec_base];
MD_FLOAT *ci_f = &cuda_cl_f[ci_vec_base];
//int numneighs = cuda_numneigh[ci_pos];
int numneighs = cuda_numneigh[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos]];
for(int k = 0; k < numneighs; k++) {
int cj = (&cuda_neighs[cuda_iclusters[SCLUSTER_SIZE * sci_pos + ci_pos] * maxneighs])[k];
// TODO Make cj accessible from super cluster data alignment (not reachable right now)
int cj_vec_base = SCJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &cuda_cl_x[cj_vec_base];
MD_FLOAT *cj_f = &cuda_cl_f[cj_vec_base];
MD_FLOAT xtmp = ci_x[SCL_CL_X_OFFSET(ci_pos) + cii_pos];
MD_FLOAT ytmp = ci_x[SCL_CL_Y_OFFSET(ci_pos) + cii_pos];
MD_FLOAT ztmp = ci_x[SCL_CL_Z_OFFSET(ci_pos) + cii_pos];
MD_FLOAT fix = 0;
MD_FLOAT fiy = 0;
MD_FLOAT fiz = 0;
int cond;
#if CLUSTER_M == CLUSTER_N
cond = half_neigh ? (ci_cj0 != cj || cii_pos < cjj_pos) :
(ci_cj0 != cj || cii_pos != cjj_pos);
#elif CLUSTER_M < CLUSTER_N
cond = half_neigh ? (ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) < cjj_pos) :
(ci_cj0 != cj || cii_pos + CLUSTER_M * (ci_pos & 0x1) != cjj_pos);
#endif
if(cond) {
MD_FLOAT delx = xtmp - cj_x[SCL_CL_X_OFFSET(ci_pos) + cjj_pos];
MD_FLOAT dely = ytmp - cj_x[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos];
MD_FLOAT delz = ztmp - cj_x[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos];
MD_FLOAT rsq = delx * delx + dely * dely + delz * delz;
if(rsq < cutforcesq) {
MD_FLOAT sr2 = 1.0 / rsq;
MD_FLOAT sr6 = sr2 * sr2 * sr2 * sigma6;
MD_FLOAT force = 48.0 * sr6 * (sr6 - 0.5) * sr2 * epsilon;
if(half_neigh) {
atomicAdd(&cj_f[SCL_CL_X_OFFSET(ci_pos) + cjj_pos], -delx * force);
atomicAdd(&cj_f[SCL_CL_Y_OFFSET(ci_pos) + cjj_pos], -dely * force);
atomicAdd(&cj_f[SCL_CL_Z_OFFSET(ci_pos) + cjj_pos], -delz * force);
}
fix += delx * force;
fiy += dely * force;
fiz += delz * force;
atomicAdd(&ci_f[SCL_CL_X_OFFSET(ci_pos) + cii_pos], fix);
atomicAdd(&ci_f[SCL_CL_Y_OFFSET(ci_pos) + cii_pos], fiy);
atomicAdd(&ci_f[SCL_CL_Z_OFFSET(ci_pos) + cii_pos], fiz);
}
}
}
}
extern "C"
double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats) {
DEBUG_MESSAGE("computeForceLJSup_cuda start\r\n");
MD_FLOAT cutforcesq = param->cutforce * param->cutforce;
MD_FLOAT sigma6 = param->sigma6;
MD_FLOAT epsilon = param->epsilon;
memsetGPU(cuda_scl_f, 0, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
if (isReneighboured) {
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
memcpyToGPU(&cuda_numneigh[ci], &neighbor->numneigh[ci], sizeof(int));
memcpyToGPU(&cuda_neighbors[ci * neighbor->maxneighs], &neighbor->neighbors[ci * neighbor->maxneighs], neighbor->numneigh[ci] * sizeof(int));
}
for(int sci = 0; sci < atom->Nsclusters_local; sci++) {
memcpyToGPU(&cuda_nclusters[sci], &atom->siclusters[sci].nclusters, sizeof(int));
//memcpyToGPU(&cuda_iclusters[sci * SCLUSTER_SIZE], &atom->siclusters[sci].iclusters, sizeof(int) * atom->siclusters[sci].nclusters);
}
memcpyToGPU(cuda_iclusters, atom->icluster_idx, atom->Nsclusters_max * SCLUSTER_SIZE * sizeof(int));
isReneighboured = 0;
}
const int threads_num = 1;
dim3 block_size = dim3(threads_num, SCLUSTER_M, CLUSTER_N);
dim3 grid_size = dim3(atom->Nsclusters_local/threads_num+1, 1, 1);
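// Thread mapping: x -> super cluster, y -> atom slot within it, z -> j-cluster lane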
double S = getTimeStamp();
LIKWID_MARKER_START("force");
computeForceLJSup_cuda_warp<<<grid_size, block_size>>>(cuda_scl_x, cuda_scl_f,
cuda_nclusters, cuda_iclusters,
atom->Nsclusters_local,
cuda_numneigh, cuda_neighbors,
neighbor->half_neigh, neighbor->maxneighs, cutforcesq,
sigma6, epsilon);
cuda_assert("computeForceLJ_cuda", cudaPeekAtLastError());
cuda_assert("computeForceLJ_cuda", cudaDeviceSynchronize());
LIKWID_MARKER_STOP("force");
double E = getTimeStamp();
DEBUG_MESSAGE("computeForceLJSup_cuda stop\r\n");
return E-S;
}
#endif //USE_SUPER_CLUSTERS

View File

@@ -22,7 +22,25 @@
# define KERNEL_NAME "CUDA"
# define CLUSTER_M 8
# define CLUSTER_N VECTOR_WIDTH
#ifdef USE_SUPER_CLUSTERS
# define XX 0
# define YY 1
# define ZZ 2
# define SCLUSTER_SIZE_X 2
# define SCLUSTER_SIZE_Y 2
# define SCLUSTER_SIZE_Z 2
# define SCLUSTER_SIZE (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z)
# define DIM_COORD(dim,coord) ((dim == XX) ? atom_x(coord) : ((dim == YY) ? atom_y(coord) : atom_z(coord)))
# define MIN(a,b) ({int _a = (a), _b = (b); _a < _b ? _a : _b; })
# define SCLUSTER_M (CLUSTER_M * SCLUSTER_SIZE)
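// With CLUSTER_M = 8 and a 2x2x2 super cluster (SCLUSTER_SIZE = 8), this yields SCLUSTER_M = 64 atom slots per super cluster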
# define computeForceLJ computeForceLJSup_cuda
#else
# define computeForceLJ computeForceLJ_cuda
#endif //USE_SUPER_CLUSTERS
# define initialIntegrate cudaInitialIntegrate
# define finalIntegrate cudaFinalIntegrate
# define updatePbc cudaUpdatePbc
@@ -55,16 +73,28 @@
# define CJ1_FROM_CI(a) (a)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#ifdef USE_SUPER_CLUSTERS
# define SCI_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
# define SCJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
#endif //USE_SUPER_CLUSTERS
#elif CLUSTER_M == CLUSTER_N * 2 // M > N
# define CJ0_FROM_CI(a) ((a) << 1)
# define CJ1_FROM_CI(a) (((a) << 1) | 0x1)
# define CI_BASE_INDEX(a,b) ((a) * CLUSTER_M * (b))
# define CJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * (b) + ((a) & 0x1) * (CLUSTER_M >> 1))
#ifdef USE_SUPER_CLUSTERS
# define SCI_BASE_INDEX(a,b) ((a) * CLUSTER_M * SCLUSTER_SIZE * (b))
# define SCJ_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_M * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (SCLUSTER_SIZE * CLUSTER_M >> 1))
#endif //USE_SUPER_CLUSTERS
#elif CLUSTER_M == CLUSTER_N / 2 // M < N
# define CJ0_FROM_CI(a) ((a) >> 1)
# define CJ1_FROM_CI(a) ((a) >> 1)
# define CI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * (b) + ((a) & 0x1) * (CLUSTER_N >> 1))
# define CJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * (b))
#ifdef USE_SUPER_CLUSTERS
# define SCI_BASE_INDEX(a,b) (((a) >> 1) * CLUSTER_N * SCLUSTER_SIZE * (b) + ((a) & 0x1) * (CLUSTER_N * SCLUSTER_SIZE >> 1))
# define SCJ_BASE_INDEX(a,b) ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
#endif //USE_SUPER_CLUSTERS
#else
# error "Invalid cluster configuration!"
#endif
@@ -78,14 +108,37 @@
#define CJ_SCALAR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 1))
#define CJ_VECTOR_BASE_INDEX(a) (CJ_BASE_INDEX(a, 3))
#ifdef USE_SUPER_CLUSTERS
#define SCI_SCALAR_BASE_INDEX(a) (SCI_BASE_INDEX(a, 1))
#define SCI_VECTOR_BASE_INDEX(a) (SCI_BASE_INDEX(a, 3))
#define SCJ_SCALAR_BASE_INDEX(a) (SCJ_BASE_INDEX(a, 1))
#define SCJ_VECTOR_BASE_INDEX(a) (SCJ_BASE_INDEX(a, 3))
#endif //USE_SUPER_CLUSTERS
#if CLUSTER_M >= CLUSTER_N
# define CL_X_OFFSET (0 * CLUSTER_M)
# define CL_Y_OFFSET (1 * CLUSTER_M)
# define CL_Z_OFFSET (2 * CLUSTER_M)
#ifdef USE_SUPER_CLUSTERS
# define SCL_CL_X_OFFSET(ci) (ci * CLUSTER_M + 0 * SCLUSTER_M)
# define SCL_CL_Y_OFFSET(ci) (ci * CLUSTER_M + 1 * SCLUSTER_M)
# define SCL_CL_Z_OFFSET(ci) (ci * CLUSTER_M + 2 * SCLUSTER_M)
# define SCL_X_OFFSET (0 * SCLUSTER_M)
# define SCL_Y_OFFSET (1 * SCLUSTER_M)
# define SCL_Z_OFFSET (2 * SCLUSTER_M)
#endif //USE_SUPER_CLUSTERS
#else
# define CL_X_OFFSET (0 * CLUSTER_N)
# define CL_Y_OFFSET (1 * CLUSTER_N)
# define CL_Z_OFFSET (2 * CLUSTER_N)
#ifdef USE_SUPER_CLUSTERS
# define SCL_X_OFFSET (0 * SCLUSTER_SIZE * CLUSTER_N)
# define SCL_Y_OFFSET (1 * SCLUSTER_SIZE * CLUSTER_N)
# define SCL_Z_OFFSET (2 * SCLUSTER_SIZE * CLUSTER_N)
#endif //USE_SUPER_CLUSTERS
#endif
typedef struct {
@@ -95,6 +148,14 @@ typedef struct {
MD_FLOAT bbminz, bbmaxz;
} Cluster;
typedef struct {
//int *iclusters;
int nclusters;
MD_FLOAT bbminx, bbmaxx;
MD_FLOAT bbminy, bbmaxy;
MD_FLOAT bbminz, bbmaxz;
} SuperCluster;
typedef struct {
int Natoms, Nlocal, Nghost, Nmax;
int Nclusters, Nclusters_local, Nclusters_ghost, Nclusters_max;
@@ -116,6 +177,17 @@ typedef struct {
Cluster *iclusters, *jclusters;
int *icluster_bin;
int dummy_cj;
#ifdef USE_SUPER_CLUSTERS
int Nsclusters, Nsclusters_local, Nsclusters_ghost, Nsclusters_max;
MD_FLOAT *scl_x;
MD_FLOAT *scl_v;
MD_FLOAT *scl_f;
int *scl_type;
int *icluster_idx;
SuperCluster *siclusters;
int *sicluster_bin;
#endif //USE_SUPER_CLUSTERS
} Atom;
extern void initAtom(Atom*);
@@ -126,6 +198,7 @@ extern int readAtom_gro(Atom*, Parameter*);
extern int readAtom_dmp(Atom*, Parameter*);
extern void growAtom(Atom*);
extern void growClusters(Atom*);
extern void growSuperClusters(Atom*);
#ifdef AOS
# define POS_DATA_LAYOUT "AoS"

View File

@@ -25,6 +25,7 @@ extern void buildNeighbor(Atom*, Neighbor*);
extern void pruneNeighbor(Parameter*, Atom*, Neighbor*);
extern void sortAtom(Atom*);
extern void buildClusters(Atom*);
extern void buildClustersGPU(Atom*);
extern void defineJClusters(Atom*);
extern void binClusters(Atom*);
extern void updateSingleAtoms(Atom*);

17
gromacs/includes/utils.h Normal file
View File

@@ -0,0 +1,17 @@
/*
* Temporary functions for debugging; remove before proceeding with the pull request
*/
#ifndef MD_BENCH_UTILS_H
#define MD_BENCH_UTILS_H
#include <atom.h>
#ifdef USE_SUPER_CLUSTERS
void verifyClusters(Atom *atom);
void verifyLayout(Atom *atom);
void checkAlignment(Atom *atom);
void showSuperclusters(Atom *atom);
#endif //USE_SUPER_CLUSTERS
#endif //MD_BENCH_UTILS_H

View File

@@ -9,6 +9,7 @@
#ifndef __VTK_H_
#define __VTK_H_
extern void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep);
extern int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_ghost_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep);
extern int write_local_cluster_edges_to_vtk_file(const char* filename, Atom* atom, int timestep);

View File

@@ -38,7 +38,16 @@ extern double computeForceLJ_cuda(Parameter *param, Atom *atom, Neighbor *neighb
extern void copyDataToCUDADevice(Atom *atom);
extern void copyDataFromCUDADevice(Atom *atom);
extern void cudaDeviceFree();
#endif
#ifdef USE_SUPER_CLUSTERS
#include <utils.h>
extern void buildNeighborGPU(Atom *atom, Neighbor *neighbor);
extern void pruneNeighborGPU(Parameter *param, Atom *atom, Neighbor *neighbor);
extern void alignDataToSuperclusters(Atom *atom);
extern void alignDataFromSuperclusters(Atom *atom);
extern double computeForceLJSup_cuda(Parameter *param, Atom *atom, Neighbor *neighbor, Stats *stats);
#endif //USE_SUPER_CLUSTERS
#endif //CUDA_TARGET
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *stats) {
if(param->force_field == FF_EAM) { initEam(eam, param); }
@@ -62,11 +71,19 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
setupNeighbor(param, atom);
setupThermo(param, atom->Natoms);
if(param->input_file == NULL) { adjustThermo(param, atom); }
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildClustersGPU(atom);
#else
buildClusters(atom); buildClusters(atom);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
defineJClusters(atom);
setupPbc(atom, param);
binClusters(atom);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildNeighborGPU(atom, neighbor);
#else
buildNeighbor(atom, neighbor); buildNeighbor(atom, neighbor);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
initDevice(atom, neighbor);
E = getTimeStamp();
return E-S;
@@ -78,11 +95,19 @@ double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor) {
LIKWID_MARKER_START("reneighbour");
updateSingleAtoms(atom);
updateAtomsPbc(atom, param);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildClustersGPU(atom);
#else
buildClusters(atom); buildClusters(atom);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
defineJClusters(atom);
setupPbc(atom, param);
binClusters(atom);
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
buildNeighborGPU(atom, neighbor);
#else
buildNeighbor(atom, neighbor); buildNeighbor(atom, neighbor);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
LIKWID_MARKER_STOP("reneighbour");
E = getTimeStamp();
return E-S;
@@ -237,11 +262,18 @@ int main(int argc, char** argv) {
}
for(int n = 0; n < param.ntimes; n++) {
//printf("Step:\t%d\r\n", n);
initialIntegrate(&param, &atom);
if((n + 1) % param.reneigh_every) {
if(!((n + 1) % param.prune_every)) {
#if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
pruneNeighborGPU(&param, &atom, &neighbor);
#else
pruneNeighbor(&param, &atom, &neighbor); pruneNeighbor(&param, &atom, &neighbor);
#endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
}
updatePbc(&atom, &param, 0);

View File

@@ -77,8 +77,13 @@ void setupNeighbor(Parameter *param, Atom *atom) {
MD_FLOAT atom_density = ((MD_FLOAT)(atom->Nlocal)) / ((xhi - xlo) * (yhi - ylo) * (zhi - zlo));
MD_FLOAT atoms_in_cell = MAX(CLUSTER_M, CLUSTER_N);
#ifdef USE_SUPER_CLUSTERS
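// Enlarge the target bin size in x/y so each bin column can be cut into whole super clusters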
MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density) * (MD_FLOAT)SCLUSTER_SIZE_X;
MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density) * (MD_FLOAT)SCLUSTER_SIZE_Y;
#else
MD_FLOAT targetsizex = cbrt(atoms_in_cell / atom_density);
MD_FLOAT targetsizey = cbrt(atoms_in_cell / atom_density);
#endif
nbinx = MAX(1, (int)ceil((xhi - xlo) / targetsizex));
nbiny = MAX(1, (int)ceil((yhi - ylo) / targetsizey));
binsizex = (xhi - xlo) / nbinx;
@@ -364,6 +369,197 @@ void buildNeighbor(Atom *atom, Neighbor *neighbor) {
DEBUG_MESSAGE("buildNeighbor end\n");
}
#ifdef USE_SUPER_CLUSTERS
// TODO For future parallelization on GPU
void buildNeighborGPU(Atom *atom, Neighbor *neighbor) {
DEBUG_MESSAGE("buildNeighborGPU start\n");
/* extend atom arrays if necessary */
if(atom->Nclusters_local > nmax) {
nmax = atom->Nclusters_local;
//nmax = atom->Nsclusters_local;
if(neighbor->numneigh) free(neighbor->numneigh);
if(neighbor->neighbors) free(neighbor->neighbors);
neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
neighbor->neighbors = (int*) malloc(nmax * neighbor->maxneighs * sizeof(int));
}
MD_FLOAT bbx = 0.5 * (binsizex + binsizex);
MD_FLOAT bby = 0.5 * (binsizey + binsizey);
MD_FLOAT rbb_sq = MAX(0.0, cutneigh - 0.5 * sqrt(bbx * bbx + bby * bby));
rbb_sq = rbb_sq * rbb_sq;
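// Cluster pairs whose bounding-box distance is below rbb_sq are guaranteed to be in
// range, so the exact atom-distance check can be skipped for them below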
int resize = 1;
/* loop over each atom, storing neighbors */
while(resize) {
int new_maxneighs = neighbor->maxneighs;
resize = 0;
//for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
//for (int scii = 0; scii < atom->siclusters[sci].nclusters; scii++) {
for(int ci = 0; ci < atom->Nclusters_local; ci++) {
//const int ci = atom->siclusters[sci].iclusters[scii];
//const int ci = atom->icluster_idx[SCLUSTER_SIZE * sci + scii];
int ci_cj1 = CJ1_FROM_CI(ci);
int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
//int *neighptr = &(neighbor->neighbors[sci * neighbor->maxneighs]);
int n = 0;
int ibin = atom->icluster_bin[ci];
MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy;
MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy;
MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz;
MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz;
for(int k = 0; k < nstencil; k++) {
int jbin = ibin + stencil[k];
int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
int cj, m = -1;
MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
const int c = bin_nclusters[jbin];
if(c > 0) {
MD_FLOAT dl, dh, dm, dm0, d_bb_sq;
do {
m++;
cj = loc_bin[m];
if(neighbor->half_neigh && ci_cj1 > cj) {
continue;
}
jbb_zmin = atom->jclusters[cj].bbminz;
jbb_zmax = atom->jclusters[cj].bbmaxz;
dl = ibb_zmin - jbb_zmax;
dh = jbb_zmin - ibb_zmax;
dm = MAX(dl, dh);
dm0 = MAX(dm, 0.0);
d_bb_sq = dm0 * dm0;
} while(m + 1 < c && d_bb_sq > cutneighsq);
jbb_xmin = atom->jclusters[cj].bbminx;
jbb_xmax = atom->jclusters[cj].bbmaxx;
jbb_ymin = atom->jclusters[cj].bbminy;
jbb_ymax = atom->jclusters[cj].bbmaxy;
while(m < c) {
if(!neighbor->half_neigh || ci_cj1 <= cj) {
dl = ibb_zmin - jbb_zmax;
dh = jbb_zmin - ibb_zmax;
dm = MAX(dl, dh);
dm0 = MAX(dm, 0.0);
d_bb_sq = dm0 * dm0;
/*if(d_bb_sq > cutneighsq) {
break;
}*/
dl = ibb_ymin - jbb_ymax;
dh = jbb_ymin - ibb_ymax;
dm = MAX(dl, dh);
dm0 = MAX(dm, 0.0);
d_bb_sq += dm0 * dm0;
dl = ibb_xmin - jbb_xmax;
dh = jbb_xmin - ibb_xmax;
dm = MAX(dl, dh);
dm0 = MAX(dm, 0.0);
d_bb_sq += dm0 * dm0;
if(d_bb_sq < cutneighsq) {
if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
neighptr[n++] = cj;
}
}
}
m++;
if(m < c) {
cj = loc_bin[m];
jbb_xmin = atom->jclusters[cj].bbminx;
jbb_xmax = atom->jclusters[cj].bbmaxx;
jbb_ymin = atom->jclusters[cj].bbminy;
jbb_ymax = atom->jclusters[cj].bbmaxy;
jbb_zmin = atom->jclusters[cj].bbminz;
jbb_zmax = atom->jclusters[cj].bbmaxz;
}
}
}
}
// Fill neighbor list with dummy values to fit vector width
if(CLUSTER_N < VECTOR_WIDTH) {
while(n % (VECTOR_WIDTH / CLUSTER_N)) {
neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
}
}
//neighbor->numneigh[sci] = n;
neighbor->numneigh[ci] = n;
if(n >= neighbor->maxneighs) {
resize = 1;
if(n >= new_maxneighs) {
new_maxneighs = n;
}
}
}
//}
if(resize) {
fprintf(stdout, "RESIZE %d\n", neighbor->maxneighs);
neighbor->maxneighs = new_maxneighs * 1.2;
free(neighbor->neighbors);
neighbor->neighbors = (int*) malloc(atom->Nmax * neighbor->maxneighs * sizeof(int));
}
}
/*
DEBUG_MESSAGE("\ncutneighsq = %f, rbb_sq = %f\n", cutneighsq, rbb_sq);
for(int ci = 0; ci < 6; ci++) {
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
int* neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
DEBUG_MESSAGE("Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
ci,
atom->iclusters[ci].bbminx,
atom->iclusters[ci].bbmaxx,
atom->iclusters[ci].bbminy,
atom->iclusters[ci].bbmaxy,
atom->iclusters[ci].bbminz,
atom->iclusters[ci].bbmaxz);
for(int cii = 0; cii < CLUSTER_M; cii++) {
DEBUG_MESSAGE("%f, %f, %f\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
DEBUG_MESSAGE("Neighbors:\n");
for(int k = 0; k < neighbor->numneigh[ci]; k++) {
int cj = neighptr[k];
int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
DEBUG_MESSAGE(" Cluster %d, bbx = {%f, %f}, bby = {%f, %f}, bbz = {%f, %f}\n",
cj,
atom->jclusters[cj].bbminx,
atom->jclusters[cj].bbmaxx,
atom->jclusters[cj].bbminy,
atom->jclusters[cj].bbmaxy,
atom->jclusters[cj].bbminz,
atom->jclusters[cj].bbmaxz);
for(int cjj = 0; cjj < CLUSTER_N; cjj++) {
DEBUG_MESSAGE(" %f, %f, %f\n", cj_x[CL_X_OFFSET + cjj], cj_x[CL_Y_OFFSET + cjj], cj_x[CL_Z_OFFSET + cjj]);
}
}
}
*/
DEBUG_MESSAGE("buildNeighborGPU end\n");
}
#endif //USE_SUPER_CLUSTERS
void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
DEBUG_MESSAGE("pruneNeighbor start\n");
//MD_FLOAT cutsq = param->cutforce * param->cutforce;
@@ -404,6 +600,53 @@ void pruneNeighbor(Parameter *param, Atom *atom, Neighbor *neighbor) {
DEBUG_MESSAGE("pruneNeighbor end\n");
}
#ifdef USE_SUPER_CLUSTERS
void pruneNeighborGPU(Parameter *param, Atom *atom, Neighbor *neighbor) {
DEBUG_MESSAGE("pruneNeighbor start\n");
//MD_FLOAT cutsq = param->cutforce * param->cutforce;
MD_FLOAT cutsq = cutneighsq;
for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
for (int scii = 0; scii < atom->siclusters[sci].nclusters; scii++) {
//const int ci = atom->siclusters[sci].iclusters[scii];
const int ci = atom->icluster_idx[SCLUSTER_SIZE * sci + scii];
int *neighs = &neighbor->neighbors[sci * neighbor->maxneighs];
int numneighs = neighbor->numneigh[sci];
int k = 0;
// Remove dummy clusters if necessary
if(CLUSTER_N < VECTOR_WIDTH) {
while(neighs[numneighs - 1] == atom->dummy_cj) {
numneighs--;
}
}
while(k < numneighs) {
int cj = neighs[k];
if(atomDistanceInRange(atom, ci, cj, cutsq)) {
k++;
} else {
numneighs--;
neighs[k] = neighs[numneighs];
}
}
// Readd dummy clusters if necessary
if(CLUSTER_N < VECTOR_WIDTH) {
while(numneighs % (VECTOR_WIDTH / CLUSTER_N)) {
neighs[numneighs++] = atom->dummy_cj; // Last cluster is always a dummy cluster
}
}
neighbor->numneigh[sci] = numneighs;
}
}
DEBUG_MESSAGE("pruneNeighbor end\n");
}
#endif //USE_SUPER_CLUSTERS
/* internal subroutines */
MD_FLOAT bindist(int i, int j) {
MD_FLOAT delx, dely, delz;
@@ -529,6 +772,36 @@ void sortAtomsByZCoord(Atom *atom) {
DEBUG_MESSAGE("sortAtomsByZCoord end\n");
}
#ifdef USE_SUPER_CLUSTERS
// TODO: Use pigeonhole sorting
void sortAtomsByCoord(Atom *atom, int dim, int bin, int start_index, int end_index) {
//DEBUG_MESSAGE("sortAtomsByCoord start\n");
int *bin_ptr = &bins[bin * atoms_per_bin];
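// Selection sort of bins[bin][start_index..end_index] by the atom coordinate in 'dim'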
for(int ac_i = start_index; ac_i <= end_index; ac_i++) {
int i = bin_ptr[ac_i];
int min_ac = ac_i;
int min_idx = i;
MD_FLOAT min_coord = DIM_COORD(dim, i);
for(int ac_j = ac_i + 1; ac_j <= end_index; ac_j++) {
int j = bin_ptr[ac_j];
MD_FLOAT coordj = DIM_COORD(dim, j);
if(coordj < min_coord) {
min_ac = ac_j;
min_idx = j;
min_coord = coordj;
}
}
bin_ptr[ac_i] = min_idx;
bin_ptr[min_ac] = i;
}
//DEBUG_MESSAGE("sortAtomsByCoord end\n");
}
#endif //USE_SUPER_CLUSTERS
void buildClusters(Atom *atom) {
DEBUG_MESSAGE("buildClusters start\n");
atom->Nclusters_local = 0;
@@ -605,6 +878,172 @@ void buildClusters(Atom *atom) {
DEBUG_MESSAGE("buildClusters end\n");
}
#ifdef USE_SUPER_CLUSTERS
void buildClustersGPU(Atom *atom) {
DEBUG_MESSAGE("buildClustersGPU start\n");
atom->Nclusters_local = 0;
atom->Nsclusters_local = 0;
/* bin local atoms */
binAtoms(atom);
for(int bin = 0; bin < mbins; bin++) {
int c = bincount[bin];
sortAtomsByCoord(atom, ZZ, bin, 0, c - 1);
int ac = 0;
int nclusters = ((c + CLUSTER_M - 1) / CLUSTER_M);
if(CLUSTER_N > CLUSTER_M && nclusters % 2) { nclusters++; }
int n_super_clusters_xy = nclusters / (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y);
if (nclusters % (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y)) n_super_clusters_xy++;
int n_super_clusters = n_super_clusters_xy / SCLUSTER_SIZE_Z;
if (n_super_clusters_xy % SCLUSTER_SIZE_Z) n_super_clusters++;
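// Pack clusters into super clusters of SCLUSTER_SIZE (2x2x2): first into x/y tiles,
// then SCLUSTER_SIZE_Z tiles stacked in z; both counts are rounded up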
//printf("%d\t%d\t%d\t%d\r\n", nclusters, n_super_clusters_xy, n_super_clusters, (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z)*n_super_clusters);
int cl_count = 0;
for (int scl = 0; scl < n_super_clusters; scl++) {
const int sci = atom->Nsclusters_local;
if(sci >= atom->Nsclusters_max) {
growSuperClusters(atom);
}
atom->siclusters[sci].nclusters = 0; // reallocate() does not zero new entries
if (cl_count >= nclusters) break;
int scl_offset = scl * SCLUSTER_SIZE * CLUSTER_M;
MD_FLOAT sc_bbminx = INFINITY, sc_bbmaxx = -INFINITY;
MD_FLOAT sc_bbminy = INFINITY, sc_bbmaxy = -INFINITY;
MD_FLOAT sc_bbminz = INFINITY, sc_bbmaxz = -INFINITY;
for (int scl_z = 0; scl_z < SCLUSTER_SIZE_Z; scl_z++) {
if (cl_count >= nclusters) break;
const int atom_scl_z_offset = scl_offset + scl_z * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X * CLUSTER_M;
const int atom_scl_z_end_idx = MIN(atom_scl_z_offset +
SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X * CLUSTER_M - 1, c - 1);
sortAtomsByCoord(atom, YY, bin, atom_scl_z_offset, atom_scl_z_end_idx);
//printf("%d\t%d\t%d\t%d\t%d\t%d\r\n", nclusters, scl, scl_z, atom_scl_z_offset, atom_scl_z_end_idx, c);
for (int scl_y = 0; scl_y < SCLUSTER_SIZE_Y; scl_y++) {
if (cl_count >= nclusters) break;
const int atom_scl_y_offset = scl_offset +
scl_z * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_X * CLUSTER_M +
scl_y * SCLUSTER_SIZE_Y * CLUSTER_M;
const int atom_scl_y_end_idx = MIN(atom_scl_y_offset +
SCLUSTER_SIZE_X * CLUSTER_M - 1, c - 1);
//printf("%d\t%d\t%d\t%d\t%d\t%d\t%d\r\n", nclusters, scl, scl_z, scl_y, atom_scl_y_offset, atom_scl_y_end_idx, c);
sortAtomsByCoord(atom, XX, bin, atom_scl_y_offset, atom_scl_y_end_idx);
for (int scl_x = 0; scl_x < SCLUSTER_SIZE_X; scl_x++) {
if (cl_count >= nclusters) break;
cl_count++;
const int cluster_sup_idx = scl_z * SCLUSTER_SIZE_Z * SCLUSTER_SIZE_Y +
scl_y * SCLUSTER_SIZE_X + scl_x;
const int ci = atom->Nclusters_local;
if(ci >= atom->Nclusters_max) {
growClusters(atom);
}
int ci_sca_base = CI_SCALAR_BASE_INDEX(ci);
int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
int *ci_type = &atom->cl_type[ci_sca_base];
MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
atom->iclusters[ci].natoms = 0;
for(int cii = 0; cii < CLUSTER_M; cii++) {
if(ac < c) {
int i = bins[bin * atoms_per_bin + ac];
MD_FLOAT xtmp = atom_x(i);
MD_FLOAT ytmp = atom_y(i);
MD_FLOAT ztmp = atom_z(i);
ci_x[CL_X_OFFSET + cii] = xtmp;
ci_x[CL_Y_OFFSET + cii] = ytmp;
ci_x[CL_Z_OFFSET + cii] = ztmp;
ci_v[CL_X_OFFSET + cii] = atom->vx[i];
ci_v[CL_Y_OFFSET + cii] = atom->vy[i];
ci_v[CL_Z_OFFSET + cii] = atom->vz[i];
// TODO: To create the bounding boxes faster, we can use SIMD operations
if(bbminx > xtmp) { bbminx = xtmp; }
if(bbmaxx < xtmp) { bbmaxx = xtmp; }
if(bbminy > ytmp) { bbminy = ytmp; }
if(bbmaxy < ytmp) { bbmaxy = ytmp; }
if(bbminz > ztmp) { bbminz = ztmp; }
if(bbmaxz < ztmp) { bbmaxz = ztmp; }
ci_type[cii] = atom->type[i];
atom->iclusters[ci].natoms++;
} else {
ci_x[CL_X_OFFSET + cii] = INFINITY;
ci_x[CL_Y_OFFSET + cii] = INFINITY;
ci_x[CL_Z_OFFSET + cii] = INFINITY;
}
ac++;
}
atom->icluster_bin[ci] = bin;
atom->iclusters[ci].bbminx = bbminx;
atom->iclusters[ci].bbmaxx = bbmaxx;
atom->iclusters[ci].bbminy = bbminy;
atom->iclusters[ci].bbmaxy = bbmaxy;
atom->iclusters[ci].bbminz = bbminz;
atom->iclusters[ci].bbmaxz = bbmaxz;
atom->Nclusters_local++;
// TODO: To create the bounding boxes faster, we can use SIMD operations
if(sc_bbminx > bbminx) { sc_bbminx = bbminx; }
if(sc_bbmaxx < bbmaxx) { sc_bbmaxx = bbmaxx; }
if(sc_bbminy > bbminy) { sc_bbminy = bbminy; }
if(sc_bbmaxy < bbmaxy) { sc_bbmaxy = bbmaxy; }
if(sc_bbminz > bbminz) { sc_bbminz = bbminz; }
if(sc_bbmaxz < bbmaxz) { sc_bbmaxz = bbmaxz; }
atom->siclusters[sci].nclusters++;
atom->icluster_idx[SCLUSTER_SIZE * sci + cluster_sup_idx] = ci;
//atom->siclusters[sci].iclusters[cluster_sup_idx] = ci;
}
}
}
atom->sicluster_bin[sci] = bin;
atom->siclusters[sci].bbminx = sc_bbminx;
atom->siclusters[sci].bbmaxx = sc_bbmaxx;
atom->siclusters[sci].bbminy = sc_bbminy;
atom->siclusters[sci].bbmaxy = sc_bbmaxy;
atom->siclusters[sci].bbminz = sc_bbminz;
atom->siclusters[sci].bbmaxz = sc_bbmaxz;
atom->Nsclusters_local++;
}
//printf("%d\t%d\r\n", (SCLUSTER_SIZE_X * SCLUSTER_SIZE_Y * SCLUSTER_SIZE_Z)*n_super_clusters, count);
}
DEBUG_MESSAGE("buildClustersGPU end\n");
}
#endif //USE_SUPER_CLUSTERS
void defineJClusters(Atom *atom) {
DEBUG_MESSAGE("defineJClusters start\n");

277
gromacs/utils.c Normal file
View File

@@ -0,0 +1,277 @@
/*
* Temporary functions for debugging; remove before proceeding with the pull request
*/
#include <stdio.h>
#include <utils.h>
extern void alignDataToSuperclusters(Atom *atom);
extern void alignDataFromSuperclusters(Atom *atom);
#ifdef USE_SUPER_CLUSTERS
/*
void verifyClusters(Atom *atom) {
unsigned int count = 0;
for (int i = 0; i < atom->Nsclusters_local; i++) {
for (int j = 0; j < atom->siclusters[i].nclusters; j++) {
for(int cii = 0; cii < CLUSTER_M; cii++, count++);
}
}
MD_FLOAT *x = malloc(count * sizeof(MD_FLOAT));
MD_FLOAT *y = malloc(count * sizeof(MD_FLOAT));
MD_FLOAT *z = malloc(count * sizeof(MD_FLOAT));
count = 0;
unsigned int diffs = 0;
printf("######### %d #########\r\n", atom->Nsclusters_local);
for (int i = 0; i < atom->Nsclusters_local; i++) {
printf("######### %d\t #########\r\n", atom->siclusters[i].nclusters);
for (int j = 0; j < atom->siclusters[i].nclusters; j++) {
//printf("%d\t", atom.siclusters[i].iclusters[j]);
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[i].iclusters[j])];
if (atom->iclusters[atom->siclusters[i].iclusters[j]].bbminx < atom->siclusters[i].bbminx ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxx > atom->siclusters[i].bbmaxx ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbminy < atom->siclusters[i].bbminy ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxy > atom->siclusters[i].bbmaxy ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbminz < atom->siclusters[i].bbminz ||
atom->iclusters[atom->siclusters[i].iclusters[j]].bbmaxz > atom->siclusters[i].bbmaxz) diffs++;
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
x[count] = ci_x[CL_X_OFFSET + cii];
y[count] = ci_x[CL_Y_OFFSET + cii];
z[count] = ci_x[CL_Z_OFFSET + cii];
//printf("x: %f\ty: %f\tz: %f\r\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
}
printf("######### \t #########\r\n");
}
printf("######### Diffs: %d\t #########\r\n", diffs);
printf("\r\n");
count = 0;
diffs = 0;
for (int i = 0; i < atom->Nclusters_local; i++) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(i)];
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
if (ci_x[CL_X_OFFSET + cii] != x[count] ||
ci_x[CL_Y_OFFSET + cii] != y[count] ||
ci_x[CL_Z_OFFSET + cii] != z[count]) diffs++;
}
}
printf("######### Diffs: %d\t #########\r\n", diffs);
}
*/
void verifyLayout(Atom *atom) {
printf("verifyLayout start\r\n");
/*
unsigned int count = 0;
for (int i = 0; i < atom->Nsclusters_local; i++) {
for (int j = 0; j < atom->siclusters[i].nclusters; j++, count++);
}
MD_FLOAT *scl_x = malloc(atom->Nsclusters_local * SCLUSTER_SIZE * 3 * CLUSTER_M * sizeof(MD_FLOAT));
for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
const unsigned int scl_offset = sci * SCLUSTER_SIZE * 3 * CLUSTER_M;
for (int ci = 0, scci = scl_offset; ci < atom->siclusters[sci].nclusters; ci++, scci += CLUSTER_M) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(atom->siclusters[sci].iclusters[ci])];
const unsigned int atom_offset = scci;
/*
for(int cii = 0, scii = atom_offset; cii < CLUSTER_M; cii++, scii += 3) {
scl_x[CL_X_OFFSET + scii] = ci_x[CL_X_OFFSET + cii];
scl_x[CL_Y_OFFSET + scii] = ci_x[CL_Y_OFFSET + cii];
scl_x[CL_Z_OFFSET + scii] = ci_x[CL_Z_OFFSET + cii];
//printf("x: %f\ty: %f\tz: %f\r\n", ci_x[CL_X_OFFSET + cii], ci_x[CL_Y_OFFSET + cii], ci_x[CL_Z_OFFSET + cii]);
}
memcpy(&scl_x[atom_offset], &ci_x[0], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&scl_x[atom_offset + SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
memcpy(&scl_x[atom_offset + 2 * SCLUSTER_SIZE * CLUSTER_M], &ci_x[0 + 2 * CLUSTER_M], CLUSTER_M * sizeof(MD_FLOAT));
}
}
*/
//alignDataToSuperclusters(atom);
//for (int sci = 0; sci < 2; sci++) {
for (int sci = 4; sci < 6; sci++) {
const unsigned int scl_offset = sci * SCLUSTER_SIZE;
MD_FLOAT *sci_x = &atom->scl_f[SCI_VECTOR_BASE_INDEX(sci)];
for (int cii = 0; cii < SCLUSTER_M; ++cii) {
const unsigned int cl_idx = cii / CLUSTER_M;
const unsigned int ciii = cii % CLUSTER_M;
/*
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[cii],
sci_x[cii + SCLUSTER_SIZE * CLUSTER_M], sci_x[cii + 2 * SCLUSTER_SIZE * CLUSTER_M]);
*/
printf("%d\t%d\t%f\t%f\t%f\r\n", atom->icluster_idx[SCLUSTER_SIZE * sci + cl_idx], cl_idx, sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
}
/*
//for (int cii = 0; cii < SCLUSTER_M; ++cii) {
for (int cii = 0; cii < SCLUSTER_M; ++cii) {
const unsigned int cl_idx = cii / CLUSTER_M;
const unsigned int ciii = cii % CLUSTER_M;
/*
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_X_OFFSET(cl_idx) + cii],
sci_x[SCL_Y_OFFSET(cl_idx) + cii], sci_x[SCL_Z_OFFSET(cl_idx) + cii]);
*/
/*
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_X_OFFSET(cl_idx) + ciii],
sci_x[SCL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_Z_OFFSET(cl_idx) + ciii]);
}
*/
/*
for (int scii = scl_offset; scii < scl_offset + SCLUSTER_SIZE; scii++) {
for (int cii = 0; cii < CLUSTER_M; ++cii) {
printf("%f\t%f\t%f\r\n", sci_x[SCL_X_OFFSET(scii) + cii],
sci_x[SCL_Y_OFFSET(scii) + cii], sci_x[SCL_Z_OFFSET(scii) + cii]);
}
/*
const unsigned int cl_offset = scii * 3 * CLUSTER_M;
//MD_FLOAT *sci_x = &scl_x[CI_VECTOR_BASE_INDEX(scii)];
for (int cii = cl_offset; cii < cl_offset + CLUSTER_M; ++cii) {
printf("%f\t%f\t%f\r\n", sci_x[CL_X_OFFSET + cii],
sci_x[CL_Y_OFFSET + cii], sci_x[CL_Z_OFFSET + cii]);
}
*/
/*
for (int cii = cl_offset; cii < cl_offset + CLUSTER_M; ++cii) {
printf("%f\t%f\t%f\r\n", scl_x[CL_X_OFFSET + cii],
scl_x[CL_Y_OFFSET + cii], scl_x[CL_Z_OFFSET + cii]);
}
*/
//}
printf("##########\t##########\r\n");
}
printf("\r\n");
//for (int ci = 0; ci < 16; ci++) {
for (int ci = 35; ci < 37; ci++) {
printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", ci, atom->icluster_bin[ci]);
MD_FLOAT *ci_x = &atom->cl_f[CI_VECTOR_BASE_INDEX(ci)];
//for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
for(int cii = 0; cii < CLUSTER_M; cii++) {
printf("%f\t%f\t%f\r\n", ci_x[CL_X_OFFSET + cii],
ci_x[CL_Y_OFFSET + cii],
ci_x[CL_Z_OFFSET + cii]);
}
printf("##########\t##########\r\n");
}
printf("verifyLayout end\r\n");
/*
for (int i = 0; i < atom->Nclusters_local; i++) {
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(i)];
for(int cii = 0; cii < CLUSTER_M; cii++, count++) {
if (ci_x[CL_X_OFFSET + cii] != x[count] ||
ci_x[CL_Y_OFFSET + cii] != y[count] ||
ci_x[CL_Z_OFFSET + cii] != z[count]) diffs++;
}
}
*/
}
void checkAlignment(Atom *atom) {
alignDataToSuperclusters(atom);
for (int sci = 4; sci < 6; sci++) {
MD_FLOAT *sci_x = &atom->scl_x[SCI_VECTOR_BASE_INDEX(sci)];
for (int cii = 0; cii < SCLUSTER_M; ++cii) {
const unsigned int cl_idx = cii / CLUSTER_M;
const unsigned int ciii = cii % CLUSTER_M;
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
}
}
for (int ci = 35; ci < 37; ci++) {
printf("$$$$$$$$$$\t%d\t%d\t$$$$$$$$$$\r\n", ci, atom->icluster_bin[ci]);
MD_FLOAT *ci_x = &atom->cl_x[CI_VECTOR_BASE_INDEX(ci)];
for(int cii = 0; cii < CLUSTER_M; cii++) {
printf("%f\t%f\t%f\r\n", ci_x[CL_X_OFFSET + cii],
ci_x[CL_Y_OFFSET + cii],
ci_x[CL_Z_OFFSET + cii]);
}
printf("##########\t##########\r\n");
}
}
void showSuperclusters(Atom *atom) {
for (int sci = 4; sci < 6; sci++) {
MD_FLOAT *sci_x = &atom->scl_x[SCI_VECTOR_BASE_INDEX(sci)];
for (int cii = 0; cii < SCLUSTER_M; ++cii) {
const unsigned int cl_idx = cii / CLUSTER_M;
const unsigned int ciii = cii % CLUSTER_M;
printf("%d\t%f\t%f\t%f\r\n", cl_idx, sci_x[SCL_CL_X_OFFSET(cl_idx) + ciii],
sci_x[SCL_CL_Y_OFFSET(cl_idx) + ciii], sci_x[SCL_CL_Z_OFFSET(cl_idx) + ciii]);
}
}
}
#endif //USE_SUPER_CLUSTERS

View File

@@ -15,8 +15,61 @@ void write_data_to_vtk_file(const char *filename, Atom* atom, int timestep) {
write_ghost_atoms_to_vtk_file(filename, atom, timestep);
write_local_cluster_edges_to_vtk_file(filename, atom, timestep);
write_ghost_cluster_edges_to_vtk_file(filename, atom, timestep);
#ifdef USE_SUPER_CLUSTERS
write_super_clusters_to_vtk_file(filename, atom, timestep);
#endif //#ifdef USE_SUPER_CLUSTERS
}
#ifdef USE_SUPER_CLUSTERS
int write_super_clusters_to_vtk_file(const char* filename, Atom* atom, int timestep) {
char timestep_filename[128];
snprintf(timestep_filename, sizeof timestep_filename, "%s_sup_%d.vtk", filename, timestep);
FILE* fp = fopen(timestep_filename, "wb");
if(fp == NULL) {
fprintf(stderr, "Could not open VTK file for writing!\n");
return -1;
}
fprintf(fp, "# vtk DataFile Version 2.0\n");
fprintf(fp, "Particle data\n");
fprintf(fp, "ASCII\n");
fprintf(fp, "DATASET UNSTRUCTURED_GRID\n");
fprintf(fp, "POINTS %d double\n", atom->Nsclusters_local * SCLUSTER_M);
for(int ci = 0; ci < atom->Nsclusters_local; ++ci) {
int factor = (rand() % 1000) + 1;
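// Presumably a debug aid: the random scale factor spreads super clusters apart in the visualization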
//double factor = ci * 10;
int ci_vec_base = SCI_VECTOR_BASE_INDEX(ci);
MD_FLOAT *ci_x = &atom->scl_x[ci_vec_base];
for(int cii = 0; cii < SCLUSTER_M; ++cii) {
fprintf(fp, "%.4f %.4f %.4f\n", ci_x[SCL_X_OFFSET + cii] * factor, ci_x[SCL_Y_OFFSET + cii] * factor, ci_x[SCL_Z_OFFSET + cii] * factor);
}
}
fprintf(fp, "\n\n");
fprintf(fp, "CELLS %d %d\n", atom->Nlocal, atom->Nlocal * 2);
for(int i = 0; i < atom->Nlocal; ++i) {
fprintf(fp, "1 %d\n", i);
}
fprintf(fp, "\n\n");
fprintf(fp, "CELL_TYPES %d\n", atom->Nlocal);
for(int i = 0; i < atom->Nlocal; ++i) {
fprintf(fp, "1\n");
}
fprintf(fp, "\n\n");
fprintf(fp, "POINT_DATA %d\n", atom->Nlocal);
fprintf(fp, "SCALARS mass double\n");
fprintf(fp, "LOOKUP_TABLE default\n");
for(int i = 0; i < atom->Nlocal; i++) {
fprintf(fp, "1.0\n");
}
fprintf(fp, "\n\n");
fclose(fp);
return 0;
}
#endif //USE_SUPER_CLUSTERS
int write_local_atoms_to_vtk_file(const char* filename, Atom* atom, int timestep) {
char timestep_filename[128];
snprintf(timestep_filename, sizeof timestep_filename, "%s_local_%d.vtk", filename, timestep);

View File

@@ -8,7 +8,8 @@ ANSI_CFLAGS += -Wextra
#
# A100 + Native
#CFLAGS = -O3 -arch=sm_80 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
CFLAGS = -O3 -arch=compute_61 -code=sm_61,sm_80,sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
# A40 + Native
#CFLAGS = -O3 -arch=sm_86 -march=native -ffast-math -funroll-loops --forward-unknown-to-host-compiler # -fopenmp
# Cascade Lake