/*
 * =======================================================================================
 *
 *   Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
 *   Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
 *
 *   This file is part of MD-Bench.
 *
 *   MD-Bench is free software: you can redistribute it and/or modify it
 *   under the terms of the GNU Lesser General Public License as published
 *   by the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   MD-Bench is distributed in the hope that it will be useful, but WITHOUT ANY
 *   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 *   PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
 *   details.
 *
 *   You should have received a copy of the GNU Lesser General Public License along
 *   with MD-Bench.  If not, see <https://www.gnu.org/licenses/>.
 * =======================================================================================
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <limits.h>
#include <math.h>
#include <float.h>
#include <string.h>
#include <unistd.h>

#include <likwid-marker.h>

#include <timing.h>
#include <allocate.h>
#include <neighbor.h>
#include <parameter.h>
#include <atom.h>
#include <stats.h>
#include <thermo.h>
#include <eam.h>
#include <pbc.h>
#include <timers.h>
#include <vtk.h>

#define HLINE "----------------------------------------------------------------------------\n"

extern void cuda_final_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom,
                                 const int num_threads_per_block);
extern void cuda_initial_integrate(bool doReneighbour, Parameter *param, Atom *atom, Atom *c_atom,
                                   const int num_threads_per_block);

extern double computeForce(bool, Parameter*, Atom*, Neighbor*, Atom*, Neighbor*, const int);
extern double computeForceTracing(Parameter*, Atom*, Neighbor*, Stats*, int, int);
extern double computeForceEam(Eam* eam, Parameter*, Atom *atom, Neighbor *neighbor, Stats *stats, int first_exec, int timestep);

/* Set default simulation parameters. */
void init(Parameter *param)
{
    param->input_file = NULL;
    param->vtk_file = NULL;
    param->force_field = FF_LJ;
    param->epsilon = 1.0;
    param->sigma6 = 1.0;
    param->rho = 0.8442;
    param->ntypes = 4;
    param->ntimes = 200;
    param->dt = 0.005;
    param->nx = 32;
    param->ny = 32;
    param->nz = 32;
    param->cutforce = 2.5;
    param->cutneigh = param->cutforce + 0.30;
    param->temp = 1.44;
    param->nstat = 100;
    param->mass = 1.0;
    param->dtforce = 0.5 * param->dt;
    param->every = 20;
    param->proc_freq = 2.4;
}

/* Allocate device-side atom and neighbor buffers and copy the initial host data to the GPU. */
void initCudaAtom(Atom *atom, Neighbor *neighbor, Atom *c_atom, Neighbor *c_neighbor)
{
    c_atom->Natoms = atom->Natoms;
    c_atom->Nlocal = atom->Nlocal;
    c_atom->Nghost = atom->Nghost;
    c_atom->Nmax = atom->Nmax;
    c_atom->ntypes = atom->ntypes;

    c_atom->border_map = NULL;

    const int Nlocal = atom->Nlocal;

    checkCUDAError( "c_atom->x malloc", cudaMalloc((void**)&(c_atom->x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
    checkCUDAError( "c_atom->x memcpy", cudaMemcpy(c_atom->x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );

    checkCUDAError( "c_atom->fx malloc", cudaMalloc((void**)&(c_atom->fx), sizeof(MD_FLOAT) * Nlocal * 3) );

    checkCUDAError( "c_atom->vx malloc", cudaMalloc((void**)&(c_atom->vx), sizeof(MD_FLOAT) * Nlocal * 3) );
    checkCUDAError( "c_atom->vx memcpy", cudaMemcpy(c_atom->vx, atom->vx, sizeof(MD_FLOAT) * Nlocal * 3, cudaMemcpyHostToDevice) );

    checkCUDAError( "c_atom->type malloc", cudaMalloc((void**)&(c_atom->type), sizeof(int) * atom->Nmax) );
    checkCUDAError( "c_atom->epsilon malloc", cudaMalloc((void**)&(c_atom->epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
    checkCUDAError( "c_atom->sigma6 malloc", cudaMalloc((void**)&(c_atom->sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
    checkCUDAError( "c_atom->cutforcesq malloc",
                    cudaMalloc((void**)&(c_atom->cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );

    checkCUDAError( "c_neighbor->neighbors malloc", cudaMalloc((void**)&c_neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs) );
    checkCUDAError( "c_neighbor->numneigh malloc", cudaMalloc((void**)&c_neighbor->numneigh, sizeof(int) * Nlocal) );

    checkCUDAError( "c_atom->type memcpy", cudaMemcpy(c_atom->type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
    checkCUDAError( "c_atom->sigma6 memcpy", cudaMemcpy(c_atom->sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
    checkCUDAError( "c_atom->epsilon memcpy", cudaMemcpy(c_atom->epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
    checkCUDAError( "c_atom->cutforcesq memcpy", cudaMemcpy(c_atom->cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
}

/* Set up the system, copy it to the GPU and build the initial neighbor lists; returns the elapsed time. */
double setup(Parameter *param, Eam *eam, Atom *atom, Neighbor *neighbor,
             Atom *c_atom, Neighbor *c_neighbor, Stats *stats, const int num_threads_per_block)
{
    if(param->force_field == FF_EAM) { initEam(eam, param); }
    double S, E;
    param->lattice = pow((4.0 / param->rho), (1.0 / 3.0));
    param->xprd = param->nx * param->lattice;
    param->yprd = param->ny * param->lattice;
    param->zprd = param->nz * param->lattice;

    S = getTimeStamp();
    initAtom(atom);
    initNeighbor(neighbor, param);
    initPbc(atom);
    initStats(stats);
    setupNeighbor();
    createAtom(atom, param);
    setupThermo(param, atom->Natoms);
    adjustThermo(param, atom);
    setupPbc(atom, param);
    initCudaAtom(atom, neighbor, c_atom, c_neighbor);
    updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
    buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block);
    E = getTimeStamp();

    return E-S;
}

/* Update ghost atoms and rebuild the neighbor lists on the GPU; returns the elapsed time. */
double reneighbour(Parameter *param, Atom *atom, Neighbor *neighbor,
                   Atom *c_atom, Neighbor *c_neighbor, const int num_threads_per_block)
{
    double S, E;

    S = getTimeStamp();
    LIKWID_MARKER_START("reneighbour");
    updateAtomsPbc_cuda(atom, param, c_atom, num_threads_per_block);
    setupPbc(atom, param);
    updatePbc_cuda(atom, param, c_atom, true, num_threads_per_block);
    //sortAtom(atom);
    buildNeighbor_cuda(atom, neighbor, c_atom, c_neighbor, num_threads_per_block);
    LIKWID_MARKER_STOP("reneighbour");
    E = getTimeStamp();

    return E-S;
}

/* Host (CPU) version of the first half of the integration step: half-step velocity update
 * plus position update. main() uses cuda_initial_integrate() instead. */
void initialIntegrate(Parameter *param, Atom *atom)
{
    for(int i = 0; i < atom->Nlocal; i++) {
        atom_vx(i) += param->dtforce * atom_fx(i);
        atom_vy(i) += param->dtforce * atom_fy(i);
        atom_vz(i) += param->dtforce * atom_fz(i);
        atom_x(i) = atom_x(i) + param->dt * atom_vx(i);
        atom_y(i) = atom_y(i) + param->dt * atom_vy(i);
        atom_z(i) = atom_z(i) + param->dt * atom_vz(i);
    }
}

/* Host (CPU) version of the second half-step velocity update. main() uses cuda_final_integrate() instead. */
void finalIntegrate(Parameter *param, Atom *atom)
{
    for(int i = 0; i < atom->Nlocal; i++) {
        atom_vx(i) += param->dtforce * atom_fx(i);
        atom_vy(i) += param->dtforce * atom_fy(i);
        atom_vz(i) += param->dtforce * atom_fz(i);
    }
}

void printAtomState(Atom *atom)
{
    printf("Atom counts: Natoms=%d Nlocal=%d Nghost=%d Nmax=%d\n",
            atom->Natoms, atom->Nlocal, atom->Nghost, atom->Nmax);

    /* int nall = atom->Nlocal + atom->Nghost; */
    /* for(int i = 0; i < nall; i++) { */
    /*     printf("%d  %f %f %f\n", i, atom->x[i], atom->y[i], atom->z[i]); */
    /* } */
}

int str2ff(const char *string)
{
    if(strncmp(string, "lj", 2) == 0) return FF_LJ;
    if(strncmp(string, "eam", 3) == 0) return FF_EAM;
    return -1;
}

const char* ff2str(int ff)
{
    if(ff == FF_LJ) { return "lj"; }
    if(ff == FF_EAM) { return "eam"; }
    return "invalid";
}

/* Threads per CUDA block, taken from the NUM_THREADS environment variable (default 32). */
int get_num_threads()
{
    const char *num_threads_env = getenv("NUM_THREADS");
    int num_threads = 0;
    if(num_threads_env == NULL) {
        num_threads = 32;
    } else {
        num_threads = atoi(num_threads_env);
    }

    return num_threads;
}

int main(int argc, char** argv)
{
    double timer[NUMTIMER];
    Eam eam;
    Atom atom;
    Neighbor neighbor;
    Stats stats;
    Parameter param;
    Atom c_atom;
    Neighbor c_neighbor;

    LIKWID_MARKER_INIT;
#pragma omp parallel
    {
        LIKWID_MARKER_REGISTER("force");
        //LIKWID_MARKER_REGISTER("reneighbour");
        //LIKWID_MARKER_REGISTER("pbc");
    }

    init(&param);

    /* Parse command line arguments. */
    for(int i = 0; i < argc; i++)
    {
        if((strcmp(argv[i], "-f") == 0))
        {
            if((param.force_field = str2ff(argv[++i])) < 0) {
                fprintf(stderr, "Invalid force field!\n");
                exit(-1);
            }
            continue;
        }
        if((strcmp(argv[i], "-i") == 0))
        {
            param.input_file = strdup(argv[++i]);
            continue;
        }
        if((strcmp(argv[i], "-n") == 0) || (strcmp(argv[i], "--nsteps") == 0))
        {
            param.ntimes = atoi(argv[++i]);
            continue;
        }
        if((strcmp(argv[i], "-nx") == 0))
        {
            param.nx = atoi(argv[++i]);
            continue;
        }
        if((strcmp(argv[i], "-ny") == 0))
        {
            param.ny = atoi(argv[++i]);
            continue;
        }
        if((strcmp(argv[i], "-nz") == 0))
        {
            param.nz = atoi(argv[++i]);
            continue;
        }
        if((strcmp(argv[i], "--freq") == 0))
        {
            param.proc_freq = atof(argv[++i]);
            continue;
        }
        if((strcmp(argv[i], "--vtk") == 0))
        {
            param.vtk_file = strdup(argv[++i]);
            continue;
        }
        if((strcmp(argv[i], "-h") == 0) || (strcmp(argv[i], "--help") == 0))
        {
            printf("MD Bench: A minimalistic re-implementation of miniMD\n");
            printf(HLINE);
            printf("-f <string>:          force field (lj or eam), default lj\n");
            printf("-i <string>:          input file for EAM\n");
            printf("-n / --nsteps <int>:  set number of timesteps for simulation\n");
            printf("-nx/-ny/-nz <int>:    set linear dimension of systembox in x/y/z direction\n");
            printf("--freq <real>:        processor frequency (GHz)\n");
            printf("--vtk <string>:       VTK file for visualization\n");
            printf(HLINE);
            exit(EXIT_SUCCESS);
        }
    }

    // this should be a multiple of 32 as operations are performed at the level of warps
    const int num_threads_per_block = get_num_threads();

    setup(&param, &eam, &atom, &neighbor, &c_atom, &c_neighbor, &stats, num_threads_per_block);
    computeThermo(0, &param, &atom);

    if(param.force_field == FF_EAM) {
        computeForceEam(&eam, &param, &atom, &neighbor, &stats, 1, 0);
    } else {
#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
        computeForceTracing(&param, &atom, &neighbor, &stats, 1, 0);
#else
        computeForce(true, &param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
#endif
    }

    timer[FORCE] = 0.0;
    timer[NEIGH] = 0.0;
    timer[TOTAL] = getTimeStamp();

    if(param.vtk_file != NULL) {
        write_atoms_to_vtk_file(param.vtk_file, &atom, 0);
    }

    /* Main timestep loop. */
    for(int n = 0; n < param.ntimes; n++) {
        const bool doReneighbour = (n + 1) % param.every == 0;

        cuda_initial_integrate(doReneighbour, &param, &atom, &c_atom, num_threads_per_block);

        if(doReneighbour) {
            timer[NEIGH] += reneighbour(&param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
        } else {
            updatePbc_cuda(&atom, &param, &c_atom, false, num_threads_per_block);
        }

        if(param.force_field == FF_EAM) {
            timer[FORCE] += computeForceEam(&eam, &param, &atom, &neighbor, &stats, 0, n + 1);
        } else {
#if defined(MEM_TRACER) || defined(INDEX_TRACER) || defined(COMPUTE_STATS)
            timer[FORCE] += computeForceTracing(&param, &atom, &neighbor, &stats, 0, n + 1);
#else
            timer[FORCE] += computeForce(doReneighbour, &param, &atom, &neighbor, &c_atom, &c_neighbor, num_threads_per_block);
#endif
        }

        cuda_final_integrate(doReneighbour, &param, &atom, &c_atom, num_threads_per_block);

        if(!((n + 1) % param.nstat) && (n + 1) < param.ntimes) {
            checkCUDAError("computeThermo atom->x memcpy back",
                           cudaMemcpy(atom.x, c_atom.x, atom.Nmax * sizeof(MD_FLOAT) * 3, cudaMemcpyDeviceToHost)
            );
            computeThermo(n + 1, &param, &atom);
        }

        if(param.vtk_file != NULL) {
            write_atoms_to_vtk_file(param.vtk_file, &atom, n + 1);
        }
    }

    timer[TOTAL] = getTimeStamp() - timer[TOTAL];
    computeThermo(-1, &param, &atom);

    printf(HLINE);
    printf("Force field: %s\n", ff2str(param.force_field));
    printf("Data layout for positions: %s\n", POS_DATA_LAYOUT);
#if PRECISION == 1
    printf("Using single precision floating point.\n");
#else
    printf("Using double precision floating point.\n");
#endif
    printf(HLINE);
    printf("System: %d atoms %d ghost atoms, Steps: %d\n", atom.Natoms, atom.Nghost, param.ntimes);
    printf("TOTAL %.2fs FORCE %.2fs NEIGH %.2fs REST %.2fs\n",
            timer[TOTAL], timer[FORCE], timer[NEIGH], timer[TOTAL]-timer[FORCE]-timer[NEIGH]);
    printf(HLINE);
    printf("Performance: %.2f million atom updates per second\n",
            1e-6 * (double) atom.Natoms * param.ntimes / timer[TOTAL]);
#ifdef COMPUTE_STATS
    displayStatistics(&atom, &param, &stats, timer);
#endif
    LIKWID_MARKER_CLOSE;
    return EXIT_SUCCESS;
}
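
/*
 * Example invocation (a sketch): the binary name "MDBench-NVCC" and the output file name
 * "traj.vtk" are assumptions; the flags match the argument parsing in main() and
 * NUM_THREADS is the environment variable read by get_num_threads() above (default 32,
 * expected to be a multiple of the warp size).
 *
 *   NUM_THREADS=64 ./MDBench-NVCC -n 200 -nx 32 -ny 32 -nz 32 --freq 2.4 --vtk traj.vtk
 */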