/* * ======================================================================================= * * Author: Jan Eitzinger (je), jan.eitzinger@fau.de * Copyright (c) 2021 RRZE, University Erlangen-Nuremberg * * This file is part of MD-Bench. * * MD-Bench is free software: you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published * by the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more * details. * * You should have received a copy of the GNU Lesser General Public License along * with MD-Bench. If not, see . * ======================================================================================= */ #include #include #include #include #include #include //--- extern "C" { #include #include #include #include #include } static MD_FLOAT xprd, yprd, zprd; static MD_FLOAT bininvx, bininvy, bininvz; static int mbinxlo, mbinylo, mbinzlo; static int nbinx, nbiny, nbinz; static int mbinx, mbiny, mbinz; // n bins in x, y, z static int mbins; //total number of bins static int atoms_per_bin; // max atoms per bin static MD_FLOAT cutneighsq; // neighbor cutoff squared static int nmax; static int nstencil; // # of bins in stencil static int* stencil; // stencil list of bin offsets static int* c_stencil = NULL; static int* c_resize_needed = NULL; static int* c_new_maxneighs = NULL; static Binning c_binning { .bincount = NULL, .bins = NULL, .mbins = 0, .atoms_per_bin = 0 }; __device__ int coord2bin_device(MD_FLOAT xin, MD_FLOAT yin, MD_FLOAT zin, Neighbor_params np) { int ix, iy, iz; if(xin >= np.xprd) { ix = (int)((xin - np.xprd) * np.bininvx) + np.nbinx - np.mbinxlo; } else if(xin >= 0.0) { ix = (int)(xin * np.bininvx) - np.mbinxlo; } else { ix = (int)(xin * np.bininvx) - np.mbinxlo - 1; } if(yin >= np.yprd) { iy = (int)((yin - np.yprd) * np.bininvy) + np.nbiny - np.mbinylo; } else if(yin >= 0.0) { iy = (int)(yin * np.bininvy) - np.mbinylo; } else { iy = (int)(yin * np.bininvy) - np.mbinylo - 1; } if(zin >= np.zprd) { iz = (int)((zin - np.zprd) * np.bininvz) + np.nbinz - np.mbinzlo; } else if(zin >= 0.0) { iz = (int)(zin * np.bininvz) - np.mbinzlo; } else { iz = (int)(zin * np.bininvz) - np.mbinzlo - 1; } return (iz * np.mbiny * np.mbinx + iy * np.mbinx + ix + 1); } /* sorts the contents of a bin to make it comparable to the CPU version */ /* uses bubble sort since atoms per bin should be relatively small and can be done in situ */ __global__ void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin){ const int i = blockIdx.x * blockDim.x + threadIdx.x; if(i >= mbins) { return; } int atoms_in_bin = bincount[i]; int *bin_ptr = &bins[i * atoms_per_bin]; int sorted; do { sorted = 1; int tmp; for(int index = 0; index < atoms_in_bin - 1; index++){ if (bin_ptr[index] > bin_ptr[index + 1]){ tmp = bin_ptr[index]; bin_ptr[index] = bin_ptr[index + 1]; bin_ptr[index + 1] = tmp; sorted = 0; } } } while (!sorted); } __global__ void binatoms_kernel(Atom a, int* bincount, int* bins, int atoms_per_bin, Neighbor_params np, int *resize_needed){ Atom* atom = &a; const int i = blockIdx.x * blockDim.x + threadIdx.x; int nall = atom->Nlocal + atom->Nghost; if(i >= nall){ return; } MD_FLOAT x = atom_x(i); MD_FLOAT y = atom_y(i); MD_FLOAT z = atom_z(i); int ibin = coord2bin_device(x, y, z, np); int ac = atomicAdd(&bincount[ibin], 1); if(ac < atoms_per_bin){ bins[ibin * atoms_per_bin + ac] = i; } else { atomicMax(resize_needed, ac); } } __global__ void compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil, int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs, MD_FLOAT cutneighsq){ const int i = blockIdx.x * blockDim.x + threadIdx.x; const int Nlocal = a.Nlocal; if( i >= Nlocal ) { return; } Atom *atom = &a; Neighbor *neighbor = &neigh; int* neighptr = &(neighbor->neighbors[i]); int n = 0; MD_FLOAT xtmp = atom_x(i); MD_FLOAT ytmp = atom_y(i); MD_FLOAT ztmp = atom_z(i); int ibin = coord2bin_device(xtmp, ytmp, ztmp, np); #ifdef EXPLICIT_TYPES int type_i = atom->type[i]; #endif for(int k = 0; k < nstencil; k++) { int jbin = ibin + stencil[k]; int* loc_bin = &bins[jbin * atoms_per_bin]; for(int m = 0; m < bincount[jbin]; m++) { int j = loc_bin[m]; if ( j == i ){ continue; } MD_FLOAT delx = xtmp - atom_x(j); MD_FLOAT dely = ytmp - atom_y(j); MD_FLOAT delz = ztmp - atom_z(j); MD_FLOAT rsq = delx * delx + dely * dely + delz * delz; #ifdef EXPLICIT_TYPES int type_j = atom->type[j]; const MD_FLOAT cutoff = atom->cutneighsq[type_i * atom->ntypes + type_j]; #else const MD_FLOAT cutoff = cutneighsq; #endif if( rsq <= cutoff ) { int idx = atom->Nlocal * n; neighptr[idx] = j; n += 1; } } } neighbor->numneigh[i] = n; if(n > neighbor->maxneighs) { atomicMax(new_maxneighs, n); } } void binatoms_cuda(Atom *c_atom, Binning *c_binning, int *c_resize_needed, Neighbor_params *np, const int threads_per_block) { int nall = c_atom->Nlocal + c_atom->Nghost; int resize = 1; const int num_blocks = ceil((float) nall / (float) threads_per_block); while(resize > 0) { resize = 0; checkCUDAError("binatoms_cuda c_binning->bincount memset", cudaMemset(c_binning->bincount, 0, c_binning->mbins * sizeof(int))); checkCUDAError("binatoms_cuda c_resize_needed memset", cudaMemset(c_resize_needed, 0, sizeof(int)) ); /*binatoms_kernel(Atom a, int* bincount, int* bins, int c_binning->atoms_per_bin, Neighbor_params np, int *resize_needed) */ binatoms_kernel<<>>(*c_atom, c_binning->bincount, c_binning->bins, c_binning->atoms_per_bin, *np, c_resize_needed); checkCUDAError( "PeekAtLastError binatoms kernel", cudaPeekAtLastError() ); checkCUDAError( "DeviceSync binatoms kernel", cudaDeviceSynchronize() ); checkCUDAError("binatoms_cuda c_resize_needed memcpy back", cudaMemcpy(&resize, c_resize_needed, sizeof(int), cudaMemcpyDeviceToHost) ); if(resize) { cudaFree(c_binning->bins); c_binning->atoms_per_bin *= 2; checkCUDAError("binatoms_cuda c_binning->bins resize malloc", cudaMalloc(&c_binning->bins, c_binning->mbins * c_binning->atoms_per_bin * sizeof(int)) ); } } atoms_per_bin = c_binning->atoms_per_bin; const int sortBlocks = ceil((float)mbins / (float)threads_per_block); /*void sort_bin_contents_kernel(int* bincount, int* bins, int mbins, int atoms_per_bin)*/ sort_bin_contents_kernel<<>>(c_binning->bincount, c_binning->bins, c_binning->mbins, c_binning->atoms_per_bin); checkCUDAError( "PeekAtLastError sort_bin_contents kernel", cudaPeekAtLastError() ); checkCUDAError( "DeviceSync sort_bin_contents kernel", cudaDeviceSynchronize() ); } void buildNeighbor_cuda(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor) { const int num_threads_per_block = get_num_threads(); int nall = atom->Nlocal + atom->Nghost; c_neighbor->maxneighs = neighbor->maxneighs; cudaProfilerStart(); /* upload stencil */ // TODO move all of this initialization into its own method if(c_stencil == NULL){ checkCUDAError( "buildNeighbor c_n_stencil malloc", cudaMalloc((void**)&c_stencil, nstencil * sizeof(int)) ); checkCUDAError( "buildNeighbor c_n_stencil memcpy", cudaMemcpy(c_stencil, stencil, nstencil * sizeof(int), cudaMemcpyHostToDevice )); } if(c_binning.mbins == 0){ c_binning.mbins = mbins; c_binning.atoms_per_bin = atoms_per_bin; checkCUDAError( "buildNeighbor c_binning->bincount malloc", cudaMalloc((void**)&(c_binning.bincount), c_binning.mbins * sizeof(int)) ); checkCUDAError( "buidlNeighbor c_binning->bins malloc", cudaMalloc((void**)&(c_binning.bins), c_binning.mbins * c_binning.atoms_per_bin * sizeof(int)) ); } Neighbor_params np { .xprd = xprd, .yprd = yprd, .zprd = zprd, .bininvx = bininvx, .bininvy = bininvy, .bininvz = bininvz, .mbinxlo = mbinxlo, .mbinylo = mbinylo, .mbinzlo = mbinzlo, .nbinx = nbinx, .nbiny = nbiny, .nbinz = nbinz, .mbinx = mbinx, .mbiny = mbiny, .mbinz = mbinz }; if(c_resize_needed == NULL){ checkCUDAError("buildNeighbor c_resize_needed malloc", cudaMalloc((void**)&c_resize_needed, sizeof(int)) ); } /* bin local & ghost atoms */ binatoms_cuda(c_atom, &c_binning, c_resize_needed, &np, num_threads_per_block); if(c_new_maxneighs == NULL){ checkCUDAError("c_new_maxneighs malloc", cudaMalloc((void**)&c_new_maxneighs, sizeof(int) )); } int resize = 1; /* extend c_neighbor arrays if necessary */ if(nall > nmax) { nmax = nall; if(c_neighbor->numneigh) cudaFree(c_neighbor->numneigh); if(c_neighbor->neighbors) cudaFree(c_neighbor->neighbors); checkCUDAError( "buildNeighbor c_numneigh malloc", cudaMalloc((void**)&(c_neighbor->numneigh), nmax * sizeof(int)) ); checkCUDAError( "buildNeighbor c_neighbors malloc", cudaMalloc((void**)&(c_neighbor->neighbors), nmax * c_neighbor->maxneighs * sizeof(int)) ); } /* loop over each atom, storing neighbors */ while(resize) { resize = 0; checkCUDAError("c_new_maxneighs memset", cudaMemset(c_new_maxneighs, 0, sizeof(int) )); // TODO call compute_neigborhood kernel here const int num_blocks = ceil((float)atom->Nlocal / (float)num_threads_per_block); /*compute_neighborhood(Atom a, Neighbor neigh, Neighbor_params np, int nstencil, int* stencil, int* bins, int atoms_per_bin, int *bincount, int *new_maxneighs) * */ compute_neighborhood<<>>(*c_atom, *c_neighbor, np, nstencil, c_stencil, c_binning.bins, c_binning.atoms_per_bin, c_binning.bincount, c_new_maxneighs, cutneighsq); checkCUDAError( "PeekAtLastError ComputeNeighbor", cudaPeekAtLastError() ); checkCUDAError( "DeviceSync ComputeNeighbor", cudaDeviceSynchronize() ); // TODO copy the value of c_new_maxneighs back to host and check if it has been modified int new_maxneighs; checkCUDAError("c_new_maxneighs memcpy back", cudaMemcpy(&new_maxneighs, c_new_maxneighs, sizeof(int), cudaMemcpyDeviceToHost)); if (new_maxneighs > c_neighbor->maxneighs){ resize = 1; } if(resize) { printf("RESIZE %d\n", c_neighbor->maxneighs); c_neighbor->maxneighs = new_maxneighs * 1.2; printf("NEW SIZE %d\n", c_neighbor->maxneighs); cudaFree(c_neighbor->neighbors); checkCUDAError("c_neighbor->neighbors resize malloc", cudaMalloc((void**)(&c_neighbor->neighbors), c_atom->Nmax * c_neighbor->maxneighs * sizeof(int))); } } neighbor->maxneighs = c_neighbor->maxneighs; cudaProfilerStop(); }