forked from moebiusband/NuSiF-Solver
checkpoint 2D-omp: Improved performance
This commit is contained in:
parent
5dd7f83dc5
commit
623b866f00
@ -25,7 +25,6 @@ int main(int argc, char** argv)
|
|||||||
Parameter params;
|
Parameter params;
|
||||||
Solver solver;
|
Solver solver;
|
||||||
initParameter(&params);
|
initParameter(&params);
|
||||||
LIKWID_MARKER_INIT;
|
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
{
|
{
|
||||||
if (dummy == 1 || omp_get_thread_num() == 0)
|
if (dummy == 1 || omp_get_thread_num() == 0)
|
||||||
@ -39,10 +38,11 @@ int main(int argc, char** argv)
|
|||||||
readParameter(&params, argv[1]);
|
readParameter(&params, argv[1]);
|
||||||
|
|
||||||
initSolver(&solver, &params, 2);
|
initSolver(&solver, &params, 2);
|
||||||
LIKWID_PROFILE("RB", solveRB);
|
startTime = getTimeStamp();
|
||||||
|
solveRB(&solver);
|
||||||
|
endTime = getTimeStamp();
|
||||||
printf(" %.2fs\n", endTime - startTime);
|
printf(" %.2fs\n", endTime - startTime);
|
||||||
writeResult(&solver, "p.dat");
|
writeResult(&solver, "p.dat");
|
||||||
|
|
||||||
LIKWID_MARKER_CLOSE;
|
|
||||||
return EXIT_SUCCESS;
|
return EXIT_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@ -116,87 +116,95 @@ void initSolver(Solver* solver, Parameter* params, int problem)
|
|||||||
|
|
||||||
void solveRB(Solver* solver)
|
void solveRB(Solver* solver)
|
||||||
{
|
{
|
||||||
int imax = solver->imax;
|
|
||||||
int jmax = solver->jmax;
|
const int imax = solver->imax;
|
||||||
double eps = solver->eps;
|
const int jmax = solver->jmax;
|
||||||
int itermax = solver->itermax;
|
const int itermax = solver->itermax;
|
||||||
double dx2 = solver->dx * solver->dx;
|
const double epssq = solver->eps * solver->eps;
|
||||||
double dy2 = solver->dy * solver->dy;
|
|
||||||
double idx2 = 1.0 / dx2;
|
const double dx2 = solver->dx * solver->dx;
|
||||||
double idy2 = 1.0 / dy2;
|
const double dy2 = solver->dy * solver->dy;
|
||||||
double factor = solver->omega * 0.5 * (dx2 * dy2) / (dx2 + dy2);
|
const double idx2 = 1.0 / dx2;
|
||||||
double* p = solver->p;
|
const double idy2 = 1.0 / dy2;
|
||||||
double* rhs = solver->rhs;
|
const double factor = solver->omega * 0.5 * (dx2 * dy2) / (dx2 + dy2);
|
||||||
double epssq = eps * eps;
|
|
||||||
int it = 0;
|
double* __restrict p = solver->p;
|
||||||
double res = 1.0;
|
double* __restrict rhs = solver->rhs;
|
||||||
int dim[2] = { 0 };
|
|
||||||
int num_threads = 1;
|
int dim[2] = { 0 };
|
||||||
#pragma omp parallel
|
#pragma omp parallel
|
||||||
|
#pragma omp single
|
||||||
{
|
{
|
||||||
#pragma omp critical
|
omp_create_dim(omp_get_num_threads(), dim);
|
||||||
num_threads = omp_get_num_threads();
|
|
||||||
}
|
}
|
||||||
omp_create_dim(num_threads, dim);
|
|
||||||
printf("%d: { %d, %d}\n", num_threads, dim[0], dim[1]);
|
double res = 0.0;
|
||||||
while ((res >= epssq) && (it < itermax)) {
|
|
||||||
#pragma omp parallel
|
for (int it = 0; it < itermax; ++it) {
|
||||||
|
|
||||||
|
res = 0.0;
|
||||||
|
|
||||||
|
#pragma omp parallel reduction(+ : res)
|
||||||
{
|
{
|
||||||
res = 0.0;
|
const int tid = omp_get_thread_num();
|
||||||
int jsw, isw;
|
|
||||||
double local_res = 0.0;
|
const int li_start = get_dim_start(get_x_choord(tid, dim), dim[0], imax);
|
||||||
int li_start = get_dim_start(get_x_choord(omp_get_thread_num(), dim),
|
const int lj_start = get_dim_start(get_y_choord(tid, dim), dim[1], jmax);
|
||||||
dim[0],
|
|
||||||
solver->imax);
|
const int limax = li_start +
|
||||||
int lj_start = get_dim_start(get_y_choord(omp_get_thread_num(), dim),
|
distribute_dim(get_x_choord(tid, dim), dim[0], imax);
|
||||||
dim[1],
|
const int ljmax = lj_start +
|
||||||
solver->jmax);
|
distribute_dim(get_y_choord(tid, dim), dim[1], jmax);
|
||||||
int limax = li_start + distribute_dim(get_x_choord(omp_get_thread_num(), dim),
|
|
||||||
dim[0],
|
|
||||||
solver->imax);
|
int jsw = ((li_start) % 2 == 0) == ((lj_start) % 2 == 0) ? 1 : 2;
|
||||||
int ljmax = lj_start + distribute_dim(get_y_choord(omp_get_thread_num(), dim),
|
|
||||||
dim[1],
|
for (int pass = 0; pass < 2; ++pass) {
|
||||||
solver->jmax);
|
int isw = jsw;
|
||||||
jsw = ((li_start) % 2 == 0) == ((lj_start) % 2 == 0) ? 1 : 2;
|
for (int i = li_start + 1; i < limax + 1; ++i) {
|
||||||
for (int pass = 0; pass < 2; pass++) {
|
|
||||||
isw = jsw;
|
|
||||||
for (int i = li_start + 1; i < limax + 1; i++) {
|
|
||||||
for (int j = lj_start + isw; j < ljmax + 1; j += 2) {
|
for (int j = lj_start + isw; j < ljmax + 1; j += 2) {
|
||||||
|
|
||||||
double r = RHS(i, j) -
|
double r = RHS(i, j) -
|
||||||
((P(i + 1, j) - 2.0 * P(i, j) + P(i - 1, j)) * idx2 +
|
((P(i + 1, j) - 2.0 * P(i, j) + P(i - 1, j)) * idx2 +
|
||||||
(P(i, j + 1) - 2.0 * P(i, j) + P(i, j - 1)) *
|
(P(i, j + 1) - 2.0 * P(i, j) + P(i, j - 1)) *
|
||||||
idy2);
|
idy2);
|
||||||
|
|
||||||
P(i, j) -= (factor * r);
|
P(i, j) -= factor * r;
|
||||||
res += (r * r);
|
res += r * r; /* reduction variable */
|
||||||
}
|
}
|
||||||
isw = 3 - isw;
|
isw = 3 - isw;
|
||||||
}
|
}
|
||||||
|
#pragma omp barrier
|
||||||
jsw = 3 - jsw;
|
jsw = 3 - jsw;
|
||||||
}
|
}
|
||||||
#pragma omp critical
|
if (lj_start == 0)
|
||||||
{
|
for (int i = li_start + 1; i < limax + 1; i++)
|
||||||
res += local_res;
|
P(i, 0) = P(i, 1);
|
||||||
}
|
if (ljmax == jmax)
|
||||||
}
|
for (int i = li_start + 1; i < limax + 1; i++)
|
||||||
#pragma omp parallel for
|
P(i, ljmax + 1) = P(i, ljmax);
|
||||||
for (int i = 1; i < imax + 1; i++) {
|
if (li_start == 0)
|
||||||
P(i, 0) = P(i, 1);
|
for (int j = lj_start + 1; j < ljmax + 1; j++)
|
||||||
P(i, jmax + 1) = P(i, jmax);
|
P(0, j) = P(1, j);
|
||||||
}
|
if (limax == imax)
|
||||||
#pragma omp parallel for
|
for (int j = lj_start + 1; j < ljmax + 1; j++)
|
||||||
for (int j = 1; j < jmax + 1; j++) {
|
P(limax + 1, j) = P(limax, j);
|
||||||
P(0, j) = P(1, j);
|
|
||||||
P(imax + 1, j) = P(imax, j);
|
}
|
||||||
}
|
|
||||||
|
res /= (double)(imax * jmax);
|
||||||
|
|
||||||
res = res / (double)(imax * jmax);
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
printf("%d Residuum: %e\n", it, res);
|
printf("%d Residuum: %e\n", it, res);
|
||||||
#endif
|
#endif
|
||||||
it++;
|
|
||||||
|
if (res < epssq) {
|
||||||
|
printf("Solver took %d iterations to reach %e\n", it + 1, sqrt(res));
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
printf("Solver took %d iterations to reach %f\n", it, sqrt(res));
|
|
||||||
|
printf("Solver reached itermax (%d) with residual %e\n", itermax, sqrt(res));
|
||||||
}
|
}
|
||||||
|
|
||||||
void writeResult(Solver* solver, char* filename)
|
void writeResult(Solver* solver, char* filename)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user