Add scripts for perf measurement, make atom memory allocation pinned using 'cudaMallocHost', add measurements for pinned atom memory
This commit is contained in:
parent
2a099da5b7
commit
c2bfa3ca3f
6
evaluate_cpu_omp_perf.sh
Normal file
6
evaluate_cpu_omp_perf.sh
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
for i in $(seq 1 32); do
|
||||||
|
echo "$i"
|
||||||
|
export "OMP_NUM_THREADS=$i"
|
||||||
|
./MDBench-GCC -n 50 | grep "Performance"
|
||||||
|
done
|
5
evaluate_gpu_perf_per_thread.sh
Normal file
5
evaluate_gpu_perf_per_thread.sh
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
END=32
|
||||||
|
for ((i=1;i<=END;i++)); do
|
||||||
|
output=$(eval "NUM_THREADS=$i ./MDBench-NVCC -n 50")
|
||||||
|
echo "$output" | grep 'atom updates per second' | sed 's/[^0-9.]//g' | awk '{print $1"e6"}'
|
||||||
|
done
|
@ -25,11 +25,29 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
|
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
|
||||||
|
void checkCUDAError(const char *msg, cudaError_t err)
|
||||||
|
{
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
//print a human readable error message
|
||||||
|
printf("[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err));
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void* allocate (int alignment, size_t bytesize)
|
void* allocate (int alignment, size_t bytesize)
|
||||||
{
|
{
|
||||||
int errorCode;
|
int errorCode;
|
||||||
void* ptr;
|
void* ptr;
|
||||||
|
|
||||||
|
checkCUDAError( "allocate", cudaMallocHost((void**)&ptr, bytesize) );
|
||||||
|
|
||||||
|
return ptr;
|
||||||
|
|
||||||
|
/*
|
||||||
errorCode = posix_memalign(&ptr, alignment, bytesize);
|
errorCode = posix_memalign(&ptr, alignment, bytesize);
|
||||||
|
|
||||||
if (errorCode) {
|
if (errorCode) {
|
||||||
@ -51,6 +69,7 @@ void* allocate (int alignment, size_t bytesize)
|
|||||||
}
|
}
|
||||||
|
|
||||||
return ptr;
|
return ptr;
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
void* reallocate (
|
void* reallocate (
|
||||||
@ -63,7 +82,7 @@ void* reallocate (
|
|||||||
|
|
||||||
if(ptr != NULL) {
|
if(ptr != NULL) {
|
||||||
memcpy(newarray, ptr, oldBytesize);
|
memcpy(newarray, ptr, oldBytesize);
|
||||||
free(ptr);
|
cudaFreeHost(ptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
return newarray;
|
return newarray;
|
||||||
|
11
src/atom.c
11
src/atom.c
@ -30,6 +30,9 @@
|
|||||||
#include <allocate.h>
|
#include <allocate.h>
|
||||||
#include <util.h>
|
#include <util.h>
|
||||||
|
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
#include <device_launch_parameters.h>
|
||||||
|
|
||||||
#define DELTA 20000
|
#define DELTA 20000
|
||||||
|
|
||||||
void initAtom(Atom *atom)
|
void initAtom(Atom *atom)
|
||||||
@ -57,10 +60,10 @@ void createAtom(Atom *atom, Parameter *param)
|
|||||||
atom->Natoms = 4 * param->nx * param->ny * param->nz;
|
atom->Natoms = 4 * param->nx * param->ny * param->nz;
|
||||||
atom->Nlocal = 0;
|
atom->Nlocal = 0;
|
||||||
atom->ntypes = param->ntypes;
|
atom->ntypes = param->ntypes;
|
||||||
atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
checkCUDAError( "atom->epsilon cudaMallocHost", cudaMallocHost((void**)&(atom->epsilon), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->epsilon = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
checkCUDAError( "atom->sigma6 cudaMallocHost", cudaMallocHost((void**)&(atom->sigma6), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->sigma6 = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
checkCUDAError( "atom->cutforcesq cudaMallocHost", cudaMallocHost((void**)&(atom->cutforcesq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutforcesq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
checkCUDAError( "atom->cutneighsq cudaMallocHost", cudaMallocHost((void**)&(atom->cutneighsq), atom->ntypes * atom->ntypes * sizeof(MD_FLOAT)) ); // atom->cutneighsq = allocate(ALIGNMENT, atom->ntypes * atom->ntypes * sizeof(MD_FLOAT));
|
||||||
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
for(int i = 0; i < atom->ntypes * atom->ntypes; i++) {
|
||||||
atom->epsilon[i] = param->epsilon;
|
atom->epsilon[i] = param->epsilon;
|
||||||
atom->sigma6[i] = param->sigma6;
|
atom->sigma6[i] = param->sigma6;
|
||||||
|
55
src/force.cu
55
src/force.cu
@ -35,16 +35,7 @@ extern "C" {
|
|||||||
#include <neighbor.h>
|
#include <neighbor.h>
|
||||||
#include <parameter.h>
|
#include <parameter.h>
|
||||||
#include <atom.h>
|
#include <atom.h>
|
||||||
}
|
#include <allocate.h>
|
||||||
|
|
||||||
void checkError(const char *msg, cudaError_t err)
|
|
||||||
{
|
|
||||||
if (err != cudaSuccess)
|
|
||||||
{
|
|
||||||
//print a human readable error message
|
|
||||||
printf("[CUDA ERROR %s]: %s\r\n", msg, cudaGetErrorString(err));
|
|
||||||
exit(-1);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// cuda kernel
|
// cuda kernel
|
||||||
@ -164,36 +155,36 @@ double computeForce(
|
|||||||
// HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
|
// HINT: Run with cuda-memcheck ./MDBench-NVCC in case of error
|
||||||
// HINT: Only works for data layout = AOS!!!
|
// HINT: Only works for data layout = AOS!!!
|
||||||
|
|
||||||
checkError( "c_atom.x malloc", cudaMalloc((void**)&(c_atom.x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
|
checkCUDAError( "c_atom.x malloc", cudaMalloc((void**)&(c_atom.x), sizeof(MD_FLOAT) * atom->Nmax * 3) );
|
||||||
checkError( "c_atom.x memcpy", cudaMemcpy(c_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
|
checkCUDAError( "c_atom.x memcpy", cudaMemcpy(c_atom.x, atom->x, sizeof(MD_FLOAT) * atom->Nmax * 3, cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
checkError( "c_atom.fx malloc", cudaMalloc((void**)&(c_atom.fx), sizeof(MD_FLOAT) * Nlocal) );
|
checkCUDAError( "c_atom.fx malloc", cudaMalloc((void**)&(c_atom.fx), sizeof(MD_FLOAT) * Nlocal) );
|
||||||
checkError( "c_atom.fx memcpy", cudaMemcpy(c_atom.fx, fx, sizeof(MD_FLOAT) * Nlocal, cudaMemcpyHostToDevice) );
|
checkCUDAError( "c_atom.fx memcpy", cudaMemcpy(c_atom.fx, fx, sizeof(MD_FLOAT) * Nlocal, cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
checkError( "c_atom.fy malloc", cudaMalloc((void**)&(c_atom.fy), sizeof(MD_FLOAT) * Nlocal) );
|
checkCUDAError( "c_atom.fy malloc", cudaMalloc((void**)&(c_atom.fy), sizeof(MD_FLOAT) * Nlocal) );
|
||||||
checkError( "c_atom.fy memcpy", cudaMemcpy(c_atom.fy, fy, sizeof(MD_FLOAT) * Nlocal, cudaMemcpyHostToDevice) );
|
checkCUDAError( "c_atom.fy memcpy", cudaMemcpy(c_atom.fy, fy, sizeof(MD_FLOAT) * Nlocal, cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
checkError( "c_atom.fz malloc", cudaMalloc((void**)&(c_atom.fz), sizeof(MD_FLOAT) * Nlocal) );
|
checkCUDAError( "c_atom.fz malloc", cudaMalloc((void**)&(c_atom.fz), sizeof(MD_FLOAT) * Nlocal) );
|
||||||
checkError( "c_atom.fz memcpy", cudaMemcpy(c_atom.fz, fz, sizeof(MD_FLOAT) * Nlocal, cudaMemcpyHostToDevice) );
|
checkCUDAError( "c_atom.fz memcpy", cudaMemcpy(c_atom.fz, fz, sizeof(MD_FLOAT) * Nlocal, cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
checkError( "c_atom.type malloc", cudaMalloc((void**)&(c_atom.type), sizeof(int) * atom->Nmax) );
|
checkCUDAError( "c_atom.type malloc", cudaMalloc((void**)&(c_atom.type), sizeof(int) * atom->Nmax) );
|
||||||
checkError( "c_atom.type memcpy", cudaMemcpy(c_atom.type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
|
checkCUDAError( "c_atom.type memcpy", cudaMemcpy(c_atom.type, atom->type, sizeof(int) * atom->Nmax, cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
checkError( "c_atom.epsilon malloc", cudaMalloc((void**)&(c_atom.epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
|
checkCUDAError( "c_atom.epsilon malloc", cudaMalloc((void**)&(c_atom.epsilon), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
|
||||||
checkError( "c_atom.epsilon memcpy", cudaMemcpy(c_atom.epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
|
checkCUDAError( "c_atom.epsilon memcpy", cudaMemcpy(c_atom.epsilon, atom->epsilon, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
checkError( "c_atom.sigma6 malloc", cudaMalloc((void**)&(c_atom.sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
|
checkCUDAError( "c_atom.sigma6 malloc", cudaMalloc((void**)&(c_atom.sigma6), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
|
||||||
checkError( "c_atom.sigma6 memcpy", cudaMemcpy(c_atom.sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
|
checkCUDAError( "c_atom.sigma6 memcpy", cudaMemcpy(c_atom.sigma6, atom->sigma6, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
checkError( "c_atom.cutforcesq malloc", cudaMalloc((void**)&(c_atom.cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
|
checkCUDAError( "c_atom.cutforcesq malloc", cudaMalloc((void**)&(c_atom.cutforcesq), sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes) );
|
||||||
checkError( "c_atom.cutforcesq memcpy", cudaMemcpy(c_atom.cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
|
checkCUDAError( "c_atom.cutforcesq memcpy", cudaMemcpy(c_atom.cutforcesq, atom->cutforcesq, sizeof(MD_FLOAT) * atom->ntypes * atom->ntypes, cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
|
|
||||||
// double start_memory_bandwidth = getTimeStamp();
|
// double start_memory_bandwidth = getTimeStamp();
|
||||||
|
|
||||||
int *c_neighs;
|
int *c_neighs;
|
||||||
checkError( "c_neighs malloc", cudaMalloc((void**)&c_neighs, sizeof(int) * Nlocal * neighbor->maxneighs) );
|
checkCUDAError( "c_neighs malloc", cudaMalloc((void**)&c_neighs, sizeof(int) * Nlocal * neighbor->maxneighs) );
|
||||||
checkError( "c_neighs memcpy", cudaMemcpy(c_neighs, neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs, cudaMemcpyHostToDevice) );
|
checkCUDAError( "c_neighs memcpy", cudaMemcpy(c_neighs, neighbor->neighbors, sizeof(int) * Nlocal * neighbor->maxneighs, cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
/*
|
/*
|
||||||
double end_memory_bandwidth = getTimeStamp();
|
double end_memory_bandwidth = getTimeStamp();
|
||||||
@ -204,8 +195,8 @@ double computeForce(
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
int *c_neigh_numneigh;
|
int *c_neigh_numneigh;
|
||||||
checkError( "c_neigh_numneigh malloc", cudaMalloc((void**)&c_neigh_numneigh, sizeof(int) * Nlocal) );
|
checkCUDAError( "c_neigh_numneigh malloc", cudaMalloc((void**)&c_neigh_numneigh, sizeof(int) * Nlocal) );
|
||||||
checkError( "c_neigh_numneigh memcpy", cudaMemcpy(c_neigh_numneigh, neighbor->numneigh, sizeof(int) * Nlocal, cudaMemcpyHostToDevice) );
|
checkCUDAError( "c_neigh_numneigh memcpy", cudaMemcpy(c_neigh_numneigh, neighbor->numneigh, sizeof(int) * Nlocal, cudaMemcpyHostToDevice) );
|
||||||
|
|
||||||
const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps
|
const int num_threads_per_block = num_threads; // this should be multiple of 32 as operations are performed at the level of warps
|
||||||
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
const int num_blocks = ceil((float)Nlocal / (float)num_threads_per_block);
|
||||||
@ -216,8 +207,8 @@ double computeForce(
|
|||||||
|
|
||||||
calc_force <<< num_blocks, num_threads_per_block >>> (c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighs, c_neigh_numneigh);
|
calc_force <<< num_blocks, num_threads_per_block >>> (c_atom, cutforcesq, sigma6, epsilon, Nlocal, neighbor->maxneighs, c_neighs, c_neigh_numneigh);
|
||||||
|
|
||||||
checkError( "PeekAtLastError", cudaPeekAtLastError() );
|
checkCUDAError( "PeekAtLastError", cudaPeekAtLastError() );
|
||||||
checkError( "DeviceSync", cudaDeviceSynchronize() );
|
checkCUDAError( "DeviceSync", cudaDeviceSynchronize() );
|
||||||
|
|
||||||
// copy results in c_atom.fx/fy/fz to atom->fx/fy/fz
|
// copy results in c_atom.fx/fy/fz to atom->fx/fy/fz
|
||||||
cudaMemcpy(atom->fx, c_atom.fx, sizeof(MD_FLOAT) * Nlocal, cudaMemcpyDeviceToHost);
|
cudaMemcpy(atom->fx, c_atom.fx, sizeof(MD_FLOAT) * Nlocal, cudaMemcpyDeviceToHost);
|
||||||
|
@ -22,8 +22,12 @@
|
|||||||
*/
|
*/
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#include <cuda_runtime.h>
|
||||||
|
|
||||||
#ifndef __ALLOCATE_H_
|
#ifndef __ALLOCATE_H_
|
||||||
#define __ALLOCATE_H_
|
#define __ALLOCATE_H_
|
||||||
extern void* allocate (int alignment, size_t bytesize);
|
extern void* allocate (int alignment, size_t bytesize);
|
||||||
extern void* reallocate (void* ptr, int alignment, size_t newBytesize, size_t oldBytesize);
|
extern void* reallocate (void* ptr, int alignment, size_t newBytesize, size_t oldBytesize);
|
||||||
|
|
||||||
|
extern void checkCUDAError(const char *msg, cudaError_t err);
|
||||||
#endif
|
#endif
|
||||||
|
Loading…
Reference in New Issue
Block a user