From c2bfa3ca3f7fd7445ae95ce19773ab2feeb6c70a Mon Sep 17 00:00:00 2001 From: Maximilian Gaul Date: Sat, 18 Dec 2021 13:02:04 +0100 Subject: [PATCH] Add scripts for perf measurement, made atom-memory allocation pinnend using 'cudaMallocHost', added measurements for atom pinnend memory --- evaluate_cpu_omp_perf.sh | 6 ++++ evaluate_gpu_perf_per_thread.sh | 5 +++ src/allocate.c | 21 ++++++++++++- src/atom.c | 11 ++++--- src/force.cu | 55 ++++++++++++++------------------- src/includes/allocate.h | 4 +++ 6 files changed, 65 insertions(+), 37 deletions(-) create mode 100644 evaluate_cpu_omp_perf.sh create mode 100644 evaluate_gpu_perf_per_thread.sh diff --git a/evaluate_cpu_omp_perf.sh b/evaluate_cpu_omp_perf.sh new file mode 100644 index 0000000..05a366f --- /dev/null +++ b/evaluate_cpu_omp_perf.sh @@ -0,0 +1,6 @@ +#!/bin/bash +for i in $(seq 1 32); do + echo "$i" + export "OMP_NUM_THREADS=$i" + ./MDBench-GCC -n 50 | grep "Performance" +done diff --git a/evaluate_gpu_perf_per_thread.sh b/evaluate_gpu_perf_per_thread.sh new file mode 100644 index 0000000..135a03b --- /dev/null +++ b/evaluate_gpu_perf_per_thread.sh @@ -0,0 +1,5 @@ +END=32 +for ((i=1;i<=END;i++)); do + output=$(eval "NUM_THREADS=$i ./MDBench-NVCC -n 50") + echo "$output" | grep 'atom updates per second' | sed 's/[^0-9.]//g' | awk '{print $1"e6"}' +done diff --git a/src/allocate.c b/src/allocate.c index 3aa4e1a..39199c7 100644 --- a/src/allocate.c +++ b/src/allocate.c @@ -25,11 +25,29 @@ #include #include +#include + +void checkCUDAError(const char *msg, cudaError_t err) +{ + if (err != cudaSuccess) + { + //print a human readable error message + printf("[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err)); + exit(-1); + } +} + + void* allocate (int alignment, size_t bytesize) { int errorCode; void* ptr; + checkCUDAError( "allocate", cudaMallocHost((void**)&ptr, bytesize) ); + + return ptr; + + /* errorCode = posix_memalign(&ptr, alignment, bytesize); if (errorCode) { @@ -51,6 +69,7 @@ void* allocate (int alignment, size_t bytesize) } return ptr; + */ } void* reallocate ( @@ -63,7 +82,7 @@ void* reallocate ( if(ptr != NULL) { memcpy(newarray, ptr, oldBytesize); - free(ptr); + cudaFreeHost(ptr); } return newarray; diff --git a/src/atom.c b/src/atom.c index 1ed54e9..b1f7549 100644 --- a/src/atom.c +++ b/src/atom.c @@ -30,6 +30,9 @@ #include #include +#include +#include + #define DELTA 20000 void initAtom(Atom *atom) @@ -57,10 +60,10 @@ void createAtom(Atom *atom, Parameter *param) atom->Natoms = 4 * param->nx * param->ny * param->nz; atom->Nlocal = 0; atom->ntypes = param->ntypes; - atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); - atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); - atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); - atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); + checkCUDAError( "atom->epsilon cudaMallocHost", cudaMallocHost((void**)&(atom->epsilon), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); + checkCUDAError( "atom->sigma6 cudaMallocHost", cudaMallocHost((void**)&(atom->sigma6), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); + checkCUDAError( "atom->cutforcesq cudaMallocHost", cudaMallocHost((void**)&(atom->cutforcesq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); + checkCUDAError( "atom->cutneighsq cudaMallocHost", cudaMallocHost((void**)&(atom->cutneighsq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)); for(int i = 0; i < atom->ntypes * atom->ntypes; i++) { atom->epsilon[i] = param->epsilon; atom->sigma6[i] = param->sigma6; diff --git a/src/force.cu b/src/force.cu index a2c397e..66e115c 100644 --- a/src/force.cu +++ b/src/force.cu @@ -35,16 +35,7 @@ extern "C" { #include #include #include -} - -void checkError(const char *msg, cudaError_t err) -{ - if (err != cudaSuccess) - { - //print a human readable error message - printf("[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err)); - exit(-1); - } + #include } // cuda kernel @@ -164,36 +155,36 @@ double computeForce( // HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error // HINT: Only works for data layout = AOS!!! - checkError( "c_atom.x malloc", cudaMalloc((void**)&(c_atom.x), sizeof(MD_FLOAT) * atom->Nmax * 3) ); - checkError( "c_atom.x memcpy", cudaMemcpy(c_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) ); + checkCUDAError( "c_atom.x malloc", cudaMalloc((void**)&(c_atom.x), sizeof(MD_FLOAT) * atom->Nmax * 3) ); + checkCUDAError( "c_atom.x memcpy", cudaMemcpy(c_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) ); - checkError( "c_atom.fx malloc", cudaMalloc((void**)&(c_atom.fx), sizeof(MD_FLOAT) * Nlocal) ); - checkError( "c_atom.fx memcpy", cudaMemcpy(c_atom.fx, fx, sizeof(MD_FLOAT) * Nlocal, cudaMemcpyHostToDevice) ); + checkCUDAError( "c_atom.fx malloc", cudaMalloc((void**)&(c_atom.fx), sizeof(MD_FLOAT) * Nlocal) ); + checkCUDAError( "c_atom.fx memcpy", cudaMemcpy(c_atom.fx, fx, sizeof(MD_FLOAT) * Nlocal, cudaMemcpyHostToDevice) ); - checkError( "c_atom.fy malloc", cudaMalloc((void**)&(c_atom.fy), sizeof(MD_FLOAT) * Nlocal) ); - checkError( "c_atom.fy memcpy", cudaMemcpy(c_atom.fy, fy, sizeof(MD_FLOAT) * Nlocal, cudaMemcpyHostToDevice) ); + checkCUDAError( "c_atom.fy malloc", cudaMalloc((void**)&(c_atom.fy), sizeof(MD_FLOAT) * Nlocal) ); + checkCUDAError( "c_atom.fy memcpy", cudaMemcpy(c_atom.fy, fy, sizeof(MD_FLOAT) * Nlocal, cudaMemcpyHostToDevice) ); - checkError( "c_atom.fz malloc", cudaMalloc((void**)&(c_atom.fz), sizeof(MD_FLOAT) * Nlocal) ); - checkError( "c_atom.fz memcpy", cudaMemcpy(c_atom.fz, fz, sizeof(MD_FLOAT) * Nlocal, cudaMemcpyHostToDevice) ); + checkCUDAError( "c_atom.fz malloc", cudaMalloc((void**)&(c_atom.fz), sizeof(MD_FLOAT) * Nlocal) ); + checkCUDAError( "c_atom.fz memcpy", cudaMemcpy(c_atom.fz, fz, sizeof(MD_FLOAT) * Nlocal, cudaMemcpyHostToDevice) ); - checkError( "c_atom.type malloc", cudaMalloc((void**)&(c_atom.type), sizeof(int) * atom->Nmax) ); - checkError( "c_atom.type memcpy", cudaMemcpy(c_atom.type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) ); + checkCUDAError( "c_atom.type malloc", cudaMalloc((void**)&(c_atom.type), sizeof(int) * atom->Nmax) ); + checkCUDAError( "c_atom.type memcpy", cudaMemcpy(c_atom.type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) ); - checkError( "c_atom.epsilon malloc", cudaMalloc((void**)&(c_atom.epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) ); - checkError( "c_atom.epsilon memcpy", cudaMemcpy(c_atom.epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) ); + checkCUDAError( "c_atom.epsilon malloc", cudaMalloc((void**)&(c_atom.epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) ); + checkCUDAError( "c_atom.epsilon memcpy", cudaMemcpy(c_atom.epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) ); - checkError( "c_atom.sigma6 malloc", cudaMalloc((void**)&(c_atom.sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) ); - checkError( "c_atom.sigma6 memcpy", cudaMemcpy(c_atom.sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) ); + checkCUDAError( "c_atom.sigma6 malloc", cudaMalloc((void**)&(c_atom.sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) ); + checkCUDAError( "c_atom.sigma6 memcpy", cudaMemcpy(c_atom.sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) ); - checkError( "c_atom.cutforcesq malloc", cudaMalloc((void**)&(c_atom.cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) ); - checkError( "c_atom.cutforcesq memcpy", cudaMemcpy(c_atom.cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) ); + checkCUDAError( "c_atom.cutforcesq malloc", cudaMalloc((void**)&(c_atom.cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) ); + checkCUDAError( "c_atom.cutforcesq memcpy", cudaMemcpy(c_atom.cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) ); // double start_memory_bandwidth = getTimeStamp(); int *c_neighs; - checkError( "c_neighs malloc", cudaMalloc((void**)&c_neighs, sizeof(int) * Nlocal * neighbor->maxneighs) ); - checkError( "c_neighs memcpy", cudaMemcpy(c_neighs, neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs, cudaMemcpyHostToDevice) ); + checkCUDAError( "c_neighs malloc", cudaMalloc((void**)&c_neighs, sizeof(int) * Nlocal * neighbor->maxneighs) ); + checkCUDAError( "c_neighs memcpy", cudaMemcpy(c_neighs, neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs, cudaMemcpyHostToDevice) ); /* double end_memory_bandwidth = getTimeStamp(); @@ -204,8 +195,8 @@ double computeForce( */ int *c_neigh_numneigh; - checkError( "c_neigh_numneigh malloc", cudaMalloc((void**)&c_neigh_numneigh, sizeof(int) * Nlocal) ); - checkError( "c_neigh_numneigh memcpy", cudaMemcpy(c_neigh_numneigh, neighbor->numneigh, sizeof(int) * Nlocal, cudaMemcpyHostToDevice) ); + checkCUDAError( "c_neigh_numneigh malloc", cudaMalloc((void**)&c_neigh_numneigh, sizeof(int) * Nlocal) ); + checkCUDAError( "c_neigh_numneigh memcpy", cudaMemcpy(c_neigh_numneigh, neighbor->numneigh, sizeof(int) * Nlocal, cudaMemcpyHostToDevice) ); const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block); @@ -216,8 +207,8 @@ double computeForce( calc_force <<< num_blocks, num_threads_per_block >>> (c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighs, c_neigh_numneigh); - checkError( "PeekAtLastError", cudaPeekAtLastError() ); - checkError( "DeviceSync", cudaDeviceSynchronize() ); + checkCUDAError( "PeekAtLastError", cudaPeekAtLastError() ); + checkCUDAError( "DeviceSync", cudaDeviceSynchronize() ); // copy results in c_atom.fx/fy/fz to atom->fx/fy/fz cudaMemcpy(atom->fx, c_atom.fx, sizeof(MD_FLOAT) * Nlocal, cudaMemcpyDeviceToHost); diff --git a/src/includes/allocate.h b/src/includes/allocate.h index b7587e3..dc7a929 100644 --- a/src/includes/allocate.h +++ b/src/includes/allocate.h @@ -22,8 +22,12 @@ */ #include +#include + #ifndef __ALLOCATE_H_ #define __ALLOCATE_H_ extern void* allocate (int alignment, size_t bytesize); extern void* reallocate (void* ptr, int alignment, size_t newBytesize, size_t oldBytesize); + +extern void checkCUDAError(const char *msg, cudaError_t err); #endif