diff --git a/gromacs/cuda/force_lj.cu b/gromacs/cuda/force_lj.cu
index aa02fdd..2bf6962 100644
--- a/gromacs/cuda/force_lj.cu
+++ b/gromacs/cuda/force_lj.cu
@@ -121,7 +121,7 @@ void copyDataToCUDADevice(Atom *atom) {
     memcpyToGPU(cuda_PBCz, atom->PBCz, atom->Nclusters_ghost * sizeof(int));

 #ifdef USE_SUPER_CLUSTERS
-    alignDataToSuperclusters(atom);
+    //alignDataToSuperclusters(atom);

     if (cuda_max_scl < atom->Nsclusters_max) {
         cuda_assert("cudaDeviceFree", cudaFree(cuda_scl_x));
@@ -158,7 +158,7 @@ void copyDataFromCUDADevice(Atom *atom) {
     memcpyFromGPU(atom->scl_v, cuda_scl_v, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));
     memcpyFromGPU(atom->scl_f, cuda_scl_f, atom->Nsclusters_max * SCLUSTER_M * 3 * sizeof(MD_FLOAT));

-    alignDataFromSuperclusters(atom);
+    //alignDataFromSuperclusters(atom);
 #endif //USE_SUPER_CLUSTERS

     DEBUG_MESSAGE("copyDataFromCUDADevice stop\r\n");
diff --git a/gromacs/includes/atom.h b/gromacs/includes/atom.h
index 281548e..b1a4c19 100644
--- a/gromacs/includes/atom.h
+++ b/gromacs/includes/atom.h
@@ -74,6 +74,7 @@
 #   define CI_BASE_INDEX(a,b)     ((a) * CLUSTER_N * (b))
 #   define CJ_BASE_INDEX(a,b)     ((a) * CLUSTER_N * (b))
 #ifdef USE_SUPER_CLUSTERS
+#   define CJ1_FROM_SCI(a)        (a)
 #   define SCI_BASE_INDEX(a,b)    ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
 #   define SCJ_BASE_INDEX(a,b)    ((a) * CLUSTER_N * SCLUSTER_SIZE * (b))
 #endif //USE_SUPER_CLUSTERS
diff --git a/gromacs/main.c b/gromacs/main.c
index 8a24baa..9851c9c 100644
--- a/gromacs/main.c
+++ b/gromacs/main.c
@@ -78,8 +78,8 @@ double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor, Stats *
 #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
     defineJClusters(atom);
 #if defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
-    //setupPbcGPU(atom, param);
-    setupPbc(atom, param);
+    setupPbcGPU(atom, param);
+    //setupPbc(atom, param);
 #else
     setupPbc(atom, param);
 #endif //defined(CUDA_TARGET) && defined(USE_SUPER_CLUSTERS)
diff --git a/gromacs/neighbor.c b/gromacs/neighbor.c
index 3124b8f..e1a2698 100644
--- a/gromacs/neighbor.c
+++ b/gromacs/neighbor.c
@@ -189,6 +189,30 @@ int atomDistanceInRange(Atom *atom, int ci, int cj, MD_FLOAT rsq) {
     return 0;
 }

+int atomDistanceInRangeGPU(Atom *atom, int sci, int cj, MD_FLOAT rsq) {
+    for (int ci = 0; ci < atom->siclusters[sci].nclusters; ci++) {
+        const int icluster_idx = atom->icluster_idx[SCLUSTER_SIZE * sci + ci];
+
+        int ci_vec_base = CI_VECTOR_BASE_INDEX(icluster_idx);
+        int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+        MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
+        MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
+
+        for(int cii = 0; cii < atom->iclusters[icluster_idx].natoms; cii++) {
+            for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
+                MD_FLOAT delx = ci_x[CL_X_OFFSET + cii] - cj_x[CL_X_OFFSET + cjj];
+                MD_FLOAT dely = ci_x[CL_Y_OFFSET + cii] - cj_x[CL_Y_OFFSET + cjj];
+                MD_FLOAT delz = ci_x[CL_Z_OFFSET + cii] - cj_x[CL_Z_OFFSET + cjj];
+                if(delx * delx + dely * dely + delz * delz < rsq) {
+                    return 1;
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
 void buildNeighbor(Atom *atom, Neighbor *neighbor) {
     DEBUG_MESSAGE("buildNeighbor start\n");

@@ -375,9 +399,8 @@ void buildNeighborGPU(Atom *atom, Neighbor *neighbor) {
     DEBUG_MESSAGE("buildNeighborGPU start\n");

     /* extend atom arrays if necessary */
-    if(atom->Nclusters_local > nmax) {
-        nmax = atom->Nclusters_local;
-        //nmax = atom->Nsclusters_local;
+    if(atom->Nsclusters_local > nmax) {
+        nmax = atom->Nsclusters_local;
         if(neighbor->numneigh) free(neighbor->numneigh);
         if(neighbor->neighbors) free(neighbor->neighbors);
         neighbor->numneigh = (int*) malloc(nmax * sizeof(int));
@@ -396,114 +419,116 @@ void buildNeighborGPU(Atom *atom, Neighbor *neighbor) {
     resize = 0;

     for (int sci = 0; sci < atom->Nsclusters_local; sci++) {
-        for (int scii = 0; scii < atom->siclusters[sci].nclusters; scii++) {
-        //for(int ci = 0; ci < atom->Nclusters_local; ci++) {
-            //const int ci = atom->siclusters[sci].iclusters[scii];
-            const int ci = atom->icluster_idx[SCLUSTER_SIZE * sci + scii];
-            int ci_cj1 = CJ1_FROM_CI(ci);
-            int *neighptr = &(neighbor->neighbors[ci * neighbor->maxneighs]);
-            //int *neighptr = &(neighbor->neighbors[sci * neighbor->maxneighs]);
-            int n = 0;
-            int ibin = atom->icluster_bin[ci];
-            MD_FLOAT ibb_xmin = atom->iclusters[ci].bbminx;
-            MD_FLOAT ibb_xmax = atom->iclusters[ci].bbmaxx;
-            MD_FLOAT ibb_ymin = atom->iclusters[ci].bbminy;
-            MD_FLOAT ibb_ymax = atom->iclusters[ci].bbmaxy;
-            MD_FLOAT ibb_zmin = atom->iclusters[ci].bbminz;
-            MD_FLOAT ibb_zmax = atom->iclusters[ci].bbmaxz;
+        int ci_cj1 = CJ1_FROM_SCI(sci);
+        int *neighptr = &(neighbor->neighbors[sci * neighbor->maxneighs]);
+        int n = 0;
+        int ibin = atom->sicluster_bin[sci];
+        MD_FLOAT ibb_xmin = atom->siclusters[sci].bbminx;
+        MD_FLOAT ibb_xmax = atom->siclusters[sci].bbmaxx;
+        MD_FLOAT ibb_ymin = atom->siclusters[sci].bbminy;
+        MD_FLOAT ibb_ymax = atom->siclusters[sci].bbmaxy;
+        MD_FLOAT ibb_zmin = atom->siclusters[sci].bbminz;
+        MD_FLOAT ibb_zmax = atom->siclusters[sci].bbmaxz;

-            for(int k = 0; k < nstencil; k++) {
-                int jbin = ibin + stencil[k];
-                int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
-                int cj, m = -1;
-                MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
-                const int c = bin_nclusters[jbin];
+        for(int k = 0; k < nstencil; k++) {
+            int jbin = ibin + stencil[k];
+            int *loc_bin = &bin_clusters[jbin * clusters_per_bin];
+            int cj, m = -1;
+            MD_FLOAT jbb_xmin, jbb_xmax, jbb_ymin, jbb_ymax, jbb_zmin, jbb_zmax;
+            const int c = bin_nclusters[jbin];

-                if(c > 0) {
-                    MD_FLOAT dl, dh, dm, dm0, d_bb_sq;
+            if(c > 0) {
+                MD_FLOAT dl, dh, dm, dm0, d_bb_sq;

-                    do {
-                        m++;
-                        cj = loc_bin[m];
-                        if(neighbor->half_neigh && ci_cj1 > cj) {
-                            continue;
-                        }
-                        jbb_zmin = atom->jclusters[cj].bbminz;
-                        jbb_zmax = atom->jclusters[cj].bbmaxz;
+                do {
+                    m++;
+                    cj = loc_bin[m];
+                    if(neighbor->half_neigh && ci_cj1 > cj) {
+                        continue;
+                    }
+                    jbb_zmin = atom->jclusters[cj].bbminz;
+                    jbb_zmax = atom->jclusters[cj].bbmaxz;
+                    dl = ibb_zmin - jbb_zmax;
+                    dh = jbb_zmin - ibb_zmax;
+                    dm = MAX(dl, dh);
+                    dm0 = MAX(dm, 0.0);
+                    d_bb_sq = dm0 * dm0;
+                } while(m + 1 < c && d_bb_sq > cutneighsq);
+
+                jbb_xmin = atom->jclusters[cj].bbminx;
+                jbb_xmax = atom->jclusters[cj].bbmaxx;
+                jbb_ymin = atom->jclusters[cj].bbminy;
+                jbb_ymax = atom->jclusters[cj].bbmaxy;
+
+                while(m < c) {
+                    if(!neighbor->half_neigh || ci_cj1 <= cj) {
                         dl = ibb_zmin - jbb_zmax;
                         dh = jbb_zmin - ibb_zmax;
                         dm = MAX(dl, dh);
                         dm0 = MAX(dm, 0.0);
                         d_bb_sq = dm0 * dm0;

-                    } while(m + 1 < c && d_bb_sq > cutneighsq);
-                    jbb_xmin = atom->jclusters[cj].bbminx;
-                    jbb_xmax = atom->jclusters[cj].bbmaxx;
-                    jbb_ymin = atom->jclusters[cj].bbminy;
-                    jbb_ymax = atom->jclusters[cj].bbmaxy;
+                        /*if(d_bb_sq > cutneighsq) {
+                            break;
+                        }*/

-                    while(m < c) {
-                        if(!neighbor->half_neigh || ci_cj1 <= cj) {
-                            dl = ibb_zmin - jbb_zmax;
-                            dh = jbb_zmin - ibb_zmax;
-                            dm = MAX(dl, dh);
-                            dm0 = MAX(dm, 0.0);
-                            d_bb_sq = dm0 * dm0;
+                        dl = ibb_ymin - jbb_ymax;
+                        dh = jbb_ymin - ibb_ymax;
+                        dm = MAX(dl, dh);
+                        dm0 = MAX(dm, 0.0);
+                        d_bb_sq += dm0 * dm0;

-                            /*if(d_bb_sq > cutneighsq) {
-                                break;
-                            }*/
+                        dl = ibb_xmin - jbb_xmax;
+                        dh = jbb_xmin - ibb_xmax;
+                        dm = MAX(dl, dh);
+                        dm0 = MAX(dm, 0.0);
+                        d_bb_sq += dm0 * dm0;

-                            dl = ibb_ymin - jbb_ymax;
-                            dh = jbb_ymin - ibb_ymax;
-                            dm = MAX(dl, dh);
-                            dm0 = MAX(dm, 0.0);
-                            d_bb_sq += dm0 * dm0;
-
-                            dl = ibb_xmin - jbb_xmax;
-                            dh = jbb_xmin - ibb_xmax;
-                            dm = MAX(dl, dh);
-                            dm0 = MAX(dm, 0.0);
-                            d_bb_sq += dm0 * dm0;
-
-                            if(d_bb_sq < cutneighsq) {
-                                if(d_bb_sq < rbb_sq || atomDistanceInRange(atom, ci, cj, cutneighsq)) {
-                                    neighptr[n++] = cj;
-                                }
+                        if(d_bb_sq < cutneighsq) {
+                            if(d_bb_sq < rbb_sq || atomDistanceInRangeGPU(atom, sci, cj, cutneighsq)) {
+                                neighptr[n++] = cj;
                             }
                         }
+                    }

-                        m++;
-                        if(m < c) {
-                            cj = loc_bin[m];
-                            jbb_xmin = atom->jclusters[cj].bbminx;
-                            jbb_xmax = atom->jclusters[cj].bbmaxx;
-                            jbb_ymin = atom->jclusters[cj].bbminy;
-                            jbb_ymax = atom->jclusters[cj].bbmaxy;
-                            jbb_zmin = atom->jclusters[cj].bbminz;
-                            jbb_zmax = atom->jclusters[cj].bbmaxz;
-                        }
+                    m++;
+                    if(m < c) {
+                        cj = loc_bin[m];
+                        jbb_xmin = atom->jclusters[cj].bbminx;
+                        jbb_xmax = atom->jclusters[cj].bbmaxx;
+                        jbb_ymin = atom->jclusters[cj].bbminy;
+                        jbb_ymax = atom->jclusters[cj].bbmaxy;
+                        jbb_zmin = atom->jclusters[cj].bbminz;
+                        jbb_zmax = atom->jclusters[cj].bbmaxz;
                     }
                 }
             }
+        }

-            // Fill neighbor list with dummy values to fit vector width
-            if(CLUSTER_N < VECTOR_WIDTH) {
-                while(n % (VECTOR_WIDTH / CLUSTER_N)) {
-                    neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
-                }
+        // Fill neighbor list with dummy values to fit vector width
+        if(CLUSTER_N < VECTOR_WIDTH) {
+            while(n % (VECTOR_WIDTH / CLUSTER_N)) {
+                neighptr[n++] = atom->dummy_cj; // Last cluster is always a dummy cluster
             }
+        }

-            //neighbor->numneigh[sci] = n;
-            neighbor->numneigh[ci] = n;
-            if(n >= neighbor->maxneighs) {
-                resize = 1;
+        neighbor->numneigh[sci] = n;
+        if(n >= neighbor->maxneighs) {
+            resize = 1;

-                if(n >= new_maxneighs) {
-                    new_maxneighs = n;
-                }
+            if(n >= new_maxneighs) {
+                new_maxneighs = n;
             }
         }
+
+
+
+
+        for (int scii = 0; scii < atom->siclusters[sci].nclusters; scii++) {
+            //for(int ci = 0; ci < atom->Nclusters_local; ci++) {
+            //const int ci = atom->siclusters[sci].iclusters[scii];
+
+        }
     }

     if(resize) {
@@ -954,6 +979,12 @@ void buildClustersGPU(Atom *atom) {
         int ci_vec_base = CI_VECTOR_BASE_INDEX(ci);
         MD_FLOAT *ci_x = &atom->cl_x[ci_vec_base];
         MD_FLOAT *ci_v = &atom->cl_v[ci_vec_base];
+
+        int sci_sca_base = SCI_SCALAR_BASE_INDEX(sci);
+        int sci_vec_base = SCI_VECTOR_BASE_INDEX(sci);
+        MD_FLOAT *sci_x = &atom->scl_x[sci_vec_base];
+        MD_FLOAT *sci_v = &atom->scl_v[sci_vec_base];
+
         int *ci_type = &atom->cl_type[ci_sca_base];
         MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
         MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
@@ -974,6 +1005,14 @@
             ci_v[CL_Y_OFFSET + cii] = atom->vy[i];
             ci_v[CL_Z_OFFSET + cii] = atom->vz[i];

+            sci_x[SCL_CL_X_OFFSET(atom->siclusters[sci].nclusters) + cii] = xtmp;
+            sci_x[SCL_CL_Y_OFFSET(atom->siclusters[sci].nclusters) + cii] = ytmp;
+            sci_x[SCL_CL_Z_OFFSET(atom->siclusters[sci].nclusters) + cii] = ztmp;
+
+            sci_v[SCL_CL_X_OFFSET(atom->siclusters[sci].nclusters) + cii] = atom->vx[i];
+            sci_v[SCL_CL_Y_OFFSET(atom->siclusters[sci].nclusters) + cii] = atom->vy[i];
+            sci_v[SCL_CL_Z_OFFSET(atom->siclusters[sci].nclusters) + cii] = atom->vz[i];
+
             // TODO: To create the bounding boxes faster, we can use SIMD operations
             if(bbminx > xtmp) { bbminx = xtmp; }
             if(bbmaxx < xtmp) { bbmaxx = xtmp; }
diff --git a/gromacs/pbc.c b/gromacs/pbc.c
index ec4581e..02938ad 100644
--- a/gromacs/pbc.c
+++ b/gromacs/pbc.c
@@ -86,6 +86,98 @@ void cpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
     DEBUG_MESSAGE("updatePbc end\n");
 }

+/* update coordinates of ghost atoms */
+/* uses mapping created in setupPbc */
+void gpuUpdatePbc(Atom *atom, Parameter *param, int firstUpdate) {
+    DEBUG_MESSAGE("gpuUpdatePbc start\n");
+    int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+    int ncj = atom->Nclusters_local / jfac;
+    MD_FLOAT xprd = param->xprd;
+    MD_FLOAT yprd = param->yprd;
+    MD_FLOAT zprd = param->zprd;
+
+    for(int cg = 0; cg < atom->Nclusters_ghost; cg++) {
+        const int cj = ncj + cg;
+        int cj_vec_base = CJ_VECTOR_BASE_INDEX(cj);
+
+        int scj_vec_base = SCJ_VECTOR_BASE_INDEX(cj);
+
+        int bmap_vec_base = CJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
+
+        int sbmap_vec_base = SCJ_VECTOR_BASE_INDEX(atom->border_map[cg]);
+
+        MD_FLOAT *cj_x = &atom->cl_x[cj_vec_base];
+        MD_FLOAT *bmap_x = &atom->cl_x[bmap_vec_base];
+
+        MD_FLOAT *scj_x = &atom->scl_x[scj_vec_base];
+        MD_FLOAT *sbmap_x = &atom->scl_x[sbmap_vec_base];
+
+        MD_FLOAT bbminx = INFINITY, bbmaxx = -INFINITY;
+        MD_FLOAT bbminy = INFINITY, bbmaxy = -INFINITY;
+        MD_FLOAT bbminz = INFINITY, bbmaxz = -INFINITY;
+
+        MD_FLOAT sbbminx = INFINITY, sbbmaxx = -INFINITY;
+        MD_FLOAT sbbminy = INFINITY, sbbmaxy = -INFINITY;
+        MD_FLOAT sbbminz = INFINITY, sbbmaxz = -INFINITY;
+
+        for(int cjj = 0; cjj < atom->jclusters[cj].natoms; cjj++) {
+            MD_FLOAT xtmp = bmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
+            MD_FLOAT ytmp = bmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
+            MD_FLOAT ztmp = bmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;
+
+            MD_FLOAT sxtmp = sbmap_x[CL_X_OFFSET + cjj] + atom->PBCx[cg] * xprd;
+            MD_FLOAT sytmp = sbmap_x[CL_Y_OFFSET + cjj] + atom->PBCy[cg] * yprd;
+            MD_FLOAT sztmp = sbmap_x[CL_Z_OFFSET + cjj] + atom->PBCz[cg] * zprd;
+
+            cj_x[CL_X_OFFSET + cjj] = xtmp;
+            cj_x[CL_Y_OFFSET + cjj] = ytmp;
+            cj_x[CL_Z_OFFSET + cjj] = ztmp;
+
+            scj_x[SCL_X_OFFSET + cjj] = sxtmp;
+            scj_x[SCL_Y_OFFSET + cjj] = sytmp;
+            scj_x[SCL_Z_OFFSET + cjj] = sztmp;
+
+            if(firstUpdate) {
+                // TODO: To create the bounding boxes faster, we can use SIMD operations
+                if(bbminx > xtmp) { bbminx = xtmp; }
+                if(bbmaxx < xtmp) { bbmaxx = xtmp; }
+                if(bbminy > ytmp) { bbminy = ytmp; }
+                if(bbmaxy < ytmp) { bbmaxy = ytmp; }
+                if(bbminz > ztmp) { bbminz = ztmp; }
+                if(bbmaxz < ztmp) { bbmaxz = ztmp; }
+
+                if(sbbminx > sxtmp) { sbbminx = sxtmp; }
+                if(sbbmaxx < sxtmp) { sbbmaxx = sxtmp; }
+                if(sbbminy > sytmp) { sbbminy = sytmp; }
+                if(sbbmaxy < sytmp) { sbbmaxy = sytmp; }
+                if(sbbminz > sztmp) { sbbminz = sztmp; }
+                if(sbbmaxz < sztmp) { sbbmaxz = sztmp; }
+            }
+        }
+
+        if(firstUpdate) {
+            for(int cjj = atom->jclusters[cj].natoms; cjj < CLUSTER_N; cjj++) {
+                cj_x[CL_X_OFFSET + cjj] = INFINITY;
+                cj_x[CL_Y_OFFSET + cjj] = INFINITY;
+                cj_x[CL_Z_OFFSET + cjj] = INFINITY;
+
+                scj_x[SCL_X_OFFSET + cjj] = INFINITY;
+                scj_x[SCL_Y_OFFSET + cjj] = INFINITY;
+                scj_x[SCL_Z_OFFSET + cjj] = INFINITY;
+            }
+
+            atom->jclusters[cj].bbminx = bbminx;
+            atom->jclusters[cj].bbmaxx = bbmaxx;
+            atom->jclusters[cj].bbminy = bbminy;
+            atom->jclusters[cj].bbmaxy = bbmaxy;
+            atom->jclusters[cj].bbminz = bbminz;
+            atom->jclusters[cj].bbmaxz = bbmaxz;
+        }
+    }
+
+    DEBUG_MESSAGE("gpuUpdatePbc end\n");
+}
+
 /* relocate atoms that have left domain according
  * to periodic boundary conditions */
 void updateAtomsPbc(Atom *atom, Parameter *param) {
@@ -236,7 +328,8 @@ void setupPbcGPU(Atom *atom, Parameter *param) {
     MD_FLOAT yprd = param->yprd;
     MD_FLOAT zprd = param->zprd;
     MD_FLOAT Cutneigh = param->cutneigh;
-    int jfac = SCLUSTER_SIZE / CLUSTER_M;
+    //int jfac = MAX(1, CLUSTER_N / CLUSTER_M);
+    int jfac = SCLUSTER_M / CLUSTER_M;
     int ncj = atom->Nsclusters_local * jfac;
     int Nghost = -1;
     int Nghost_atoms = 0;
@@ -245,6 +338,7 @@ void setupPbcGPU(Atom *atom, Parameter *param) {
         if(atom->jclusters[cj].natoms > 0) {
             if(atom->Nsclusters_local + (Nghost + (jfac - 1) + 7) / jfac >= atom->Nclusters_max) {
                 growClusters(atom);
+                //growSuperClusters(atom);
             }

             if((Nghost + 7) * CLUSTER_M >= NmaxGhost) {
@@ -293,6 +387,7 @@ void setupPbcGPU(Atom *atom, Parameter *param) {

     if(ncj + (Nghost + (jfac - 1) + 1) / jfac >= atom->Nclusters_max) {
         growClusters(atom);
+        //growSuperClusters(atom);
     }

     // Add dummy cluster at the end
@@ -311,6 +406,6 @@ void setupPbcGPU(Atom *atom, Parameter *param) {
     atom->Nclusters = atom->Nclusters_local + Nghost + 1;

     // Update created ghost clusters positions
-    cudaUpdatePbc(atom, param, 1);
+    gpuUpdatePbc(atom, param, 1);
     DEBUG_MESSAGE("setupPbcGPU end\n");
 }
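
Note on the new layout (illustrative only, not part of the patch): the neighbor.c hunks above switch buildNeighborGPU() to a super-cluster-indexed list, writing numneigh[sci] and neighbors[sci * maxneighs] and reaching the member i-clusters through icluster_idx[SCLUSTER_SIZE * sci + scii], the same indexing atomDistanceInRangeGPU() uses. The C sketch below only demonstrates that traversal; the struct layouts, the SCLUSTER_SIZE value, and the toy main() are assumptions made for the example, while the field names and index arithmetic follow the diff.

#include <stdio.h>

#define SCLUSTER_SIZE 4   /* i-clusters per super-cluster (assumed value for this sketch) */

typedef struct { int nclusters; } SuperCluster;   /* simplified stand-in for atom->siclusters[sci] */

typedef struct {
    int Nsclusters_local;
    SuperCluster *siclusters;
    int *icluster_idx;   /* icluster_idx[SCLUSTER_SIZE * sci + scii] -> i-cluster index */
} Atom;

typedef struct {
    int maxneighs;
    int *numneigh;       /* indexed by sci after this patch */
    int *neighbors;      /* sci-major: neighbors[sci * maxneighs + k] */
} Neighbor;

/* Visit every (i-cluster, j-cluster) pair stored for each super-cluster. */
static long countClusterPairs(Atom *atom, Neighbor *neighbor) {
    long pairs = 0;
    for(int sci = 0; sci < atom->Nsclusters_local; sci++) {
        int *neighptr = &neighbor->neighbors[sci * neighbor->maxneighs];
        for(int k = 0; k < neighbor->numneigh[sci]; k++) {
            int cj = neighptr[k];
            for(int scii = 0; scii < atom->siclusters[sci].nclusters; scii++) {
                int ci = atom->icluster_idx[SCLUSTER_SIZE * sci + scii];
                (void)ci; (void)cj;   /* a force kernel would load cl_x/scl_x for ci and cj here */
                pairs++;
            }
        }
    }
    return pairs;
}

int main(void) {
    /* Toy input: one super-cluster holding two i-clusters, with two j-cluster neighbors. */
    SuperCluster scl = { 2 };
    int icluster_idx[SCLUSTER_SIZE] = { 0, 1, -1, -1 };
    int numneigh[1] = { 2 };
    int neighbors[4] = { 5, 7, 0, 0 };
    Atom atom = { 1, &scl, icluster_idx };
    Neighbor neighbor = { 4, numneigh, neighbors };
    printf("cluster pairs visited: %ld\n", countClusterPairs(&atom, &neighbor));   /* prints 4 */
    return 0;
}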