checkpoint 2D-omp: Improved performance

This commit is contained in:
indiano 2025-05-05 18:23:01 +02:00
parent 5dd7f83dc5
commit 623b866f00
2 changed files with 72 additions and 64 deletions

View File

@ -25,7 +25,6 @@ int main(int argc, char** argv)
Parameter params; Parameter params;
Solver solver; Solver solver;
initParameter(&params); initParameter(&params);
LIKWID_MARKER_INIT;
#pragma omp parallel #pragma omp parallel
{ {
if (dummy == 1 || omp_get_thread_num() == 0) if (dummy == 1 || omp_get_thread_num() == 0)
@ -39,10 +38,11 @@ int main(int argc, char** argv)
readParameter(&params, argv[1]); readParameter(&params, argv[1]);
initSolver(&solver, &params, 2); initSolver(&solver, &params, 2);
LIKWID_PROFILE("RB", solveRB); startTime = getTimeStamp();
solveRB(&solver);
endTime = getTimeStamp();
printf(" %.2fs\n", endTime - startTime); printf(" %.2fs\n", endTime - startTime);
writeResult(&solver, "p.dat"); writeResult(&solver, "p.dat");
LIKWID_MARKER_CLOSE;
return EXIT_SUCCESS; return EXIT_SUCCESS;
} }

View File

@ -116,87 +116,95 @@ void initSolver(Solver* solver, Parameter* params, int problem)
void solveRB(Solver* solver) void solveRB(Solver* solver)
{ {
int imax = solver->imax;
int jmax = solver->jmax; const int imax = solver->imax;
double eps = solver->eps; const int jmax = solver->jmax;
int itermax = solver->itermax; const int itermax = solver->itermax;
double dx2 = solver->dx * solver->dx; const double epssq = solver->eps * solver->eps;
double dy2 = solver->dy * solver->dy;
double idx2 = 1.0 / dx2; const double dx2 = solver->dx * solver->dx;
double idy2 = 1.0 / dy2; const double dy2 = solver->dy * solver->dy;
double factor = solver->omega * 0.5 * (dx2 * dy2) / (dx2 + dy2); const double idx2 = 1.0 / dx2;
double* p = solver->p; const double idy2 = 1.0 / dy2;
double* rhs = solver->rhs; const double factor = solver->omega * 0.5 * (dx2 * dy2) / (dx2 + dy2);
double epssq = eps * eps;
int it = 0; double* __restrict p = solver->p;
double res = 1.0; double* __restrict rhs = solver->rhs;
int dim[2] = { 0 };
int num_threads = 1; int dim[2] = { 0 };
#pragma omp parallel #pragma omp parallel
#pragma omp single
{ {
#pragma omp critical omp_create_dim(omp_get_num_threads(), dim);
num_threads = omp_get_num_threads();
} }
omp_create_dim(num_threads, dim);
printf("%d: { %d, %d}\n", num_threads, dim[0], dim[1]); double res = 0.0;
while ((res >= epssq) && (it < itermax)) {
#pragma omp parallel for (int it = 0; it < itermax; ++it) {
res = 0.0;
#pragma omp parallel reduction(+ : res)
{ {
res = 0.0; const int tid = omp_get_thread_num();
int jsw, isw;
double local_res = 0.0; const int li_start = get_dim_start(get_x_choord(tid, dim), dim[0], imax);
int li_start = get_dim_start(get_x_choord(omp_get_thread_num(), dim), const int lj_start = get_dim_start(get_y_choord(tid, dim), dim[1], jmax);
dim[0],
solver->imax); const int limax = li_start +
int lj_start = get_dim_start(get_y_choord(omp_get_thread_num(), dim), distribute_dim(get_x_choord(tid, dim), dim[0], imax);
dim[1], const int ljmax = lj_start +
solver->jmax); distribute_dim(get_y_choord(tid, dim), dim[1], jmax);
int limax = li_start + distribute_dim(get_x_choord(omp_get_thread_num(), dim),
dim[0],
solver->imax); int jsw = ((li_start) % 2 == 0) == ((lj_start) % 2 == 0) ? 1 : 2;
int ljmax = lj_start + distribute_dim(get_y_choord(omp_get_thread_num(), dim),
dim[1], for (int pass = 0; pass < 2; ++pass) {
solver->jmax); int isw = jsw;
jsw = ((li_start) % 2 == 0) == ((lj_start) % 2 == 0) ? 1 : 2; for (int i = li_start + 1; i < limax + 1; ++i) {
for (int pass = 0; pass < 2; pass++) {
isw = jsw;
for (int i = li_start + 1; i < limax + 1; i++) {
for (int j = lj_start + isw; j < ljmax + 1; j += 2) { for (int j = lj_start + isw; j < ljmax + 1; j += 2) {
double r = RHS(i, j) - double r = RHS(i, j) -
((P(i + 1, j) - 2.0 * P(i, j) + P(i - 1, j)) * idx2 + ((P(i + 1, j) - 2.0 * P(i, j) + P(i - 1, j)) * idx2 +
(P(i, j + 1) - 2.0 * P(i, j) + P(i, j - 1)) * (P(i, j + 1) - 2.0 * P(i, j) + P(i, j - 1)) *
idy2); idy2);
P(i, j) -= (factor * r); P(i, j) -= factor * r;
res += (r * r); res += r * r; /* reduction variable */
} }
isw = 3 - isw; isw = 3 - isw;
} }
#pragma omp barrier
jsw = 3 - jsw; jsw = 3 - jsw;
} }
#pragma omp critical if (lj_start == 0)
{ for (int i = li_start + 1; i < limax + 1; i++)
res += local_res; P(i, 0) = P(i, 1);
} if (ljmax == jmax)
} for (int i = li_start + 1; i < limax + 1; i++)
#pragma omp parallel for P(i, ljmax + 1) = P(i, ljmax);
for (int i = 1; i < imax + 1; i++) { if (li_start == 0)
P(i, 0) = P(i, 1); for (int j = lj_start + 1; j < ljmax + 1; j++)
P(i, jmax + 1) = P(i, jmax); P(0, j) = P(1, j);
} if (limax == imax)
#pragma omp parallel for for (int j = lj_start + 1; j < ljmax + 1; j++)
for (int j = 1; j < jmax + 1; j++) { P(limax + 1, j) = P(limax, j);
P(0, j) = P(1, j);
P(imax + 1, j) = P(imax, j); }
}
res /= (double)(imax * jmax);
res = res / (double)(imax * jmax);
#ifdef DEBUG #ifdef DEBUG
printf("%d Residuum: %e\n", it, res); printf("%d Residuum: %e\n", it, res);
#endif #endif
it++;
if (res < epssq) {
printf("Solver took %d iterations to reach %e\n", it + 1, sqrt(res));
return;
}
} }
printf("Solver took %d iterations to reach %f\n", it, sqrt(res));
printf("Solver reached itermax (%d) with residual %e\n", itermax, sqrt(res));
} }
void writeResult(Solver* solver, char* filename) void writeResult(Solver* solver, char* filename)