From 512e2903c8c5994fa7e8447a8b82c33591b4b30d Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Mon, 6 Jan 2025 12:17:19 +0100 Subject: [PATCH] Seuquential version finished. Start with MPI stuff Does not compile. --- Makefile | 2 +- config.mk | 2 + hpcg.par | 11 + mk/include_CLANG.mk | 12 +- src/comm.c | 586 ++++++++++++++++++++++++++++++++++++++++---- src/comm.h | 40 ++- src/main.c | 108 +++++--- src/matrix.c | 2 +- src/matrix.h | 5 +- src/parameter.c | 68 +---- src/parameter.h | 19 +- src/progress.c | 15 +- src/solver.c | 179 +++++++++++++- src/solver.h | 28 ++- 14 files changed, 874 insertions(+), 203 deletions(-) create mode 100644 hpcg.par diff --git a/Makefile b/Makefile index 8b8c60a..d2dac46 100644 --- a/Makefile +++ b/Makefile @@ -27,7 +27,7 @@ clist = $(subst $(eval) ,$c,$(strip $1)) define CLANGD_TEMPLATE CompileFlags: - Add: [$(call clist,$(INCLUDES)), $(call clist,$(CPPFLAGS))] + Add: [$(call clist,$(INCLUDES)), $(call clist,$(DEFINES)), $(call clist,$(OPTIONS)), -xc] Compiler: clang endef diff --git a/config.mk b/config.mk index 7d53436..4142421 100644 --- a/config.mk +++ b/config.mk @@ -1,9 +1,11 @@ # Supported: GCC, CLANG, ICC TOOLCHAIN ?= CLANG +ENABLE_MPI ?= true # ENABLE_OPENMP ?= false #Feature options OPTIONS += -DARRAY_ALIGNMENT=64 +#OPTIONS += -DVERBOSE #OPTIONS += -DVERBOSE_AFFINITY #OPTIONS += -DVERBOSE_DATASIZE #OPTIONS += -DVERBOSE_TIMER diff --git a/hpcg.par b/hpcg.par new file mode 100644 index 0000000..ce11b32 --- /dev/null +++ b/hpcg.par @@ -0,0 +1,11 @@ +#============================================================================== +# HPCG 27pt stencil +#============================================================================== + +filename generate +nx 50 +ny 50 +nz 50 + +itermax 10000 +eps 0.0001 diff --git a/mk/include_CLANG.mk b/mk/include_CLANG.mk index 429894d..0030907 100644 --- a/mk/include_CLANG.mk +++ b/mk/include_CLANG.mk @@ -1,7 +1,13 @@ +ifeq ($(strip $(ENABLE_MPI)),true) CC = mpicc +DEFINES = -D_MPI +else +CC = clang +endif + LD = $(CC) -ifeq ($(ENABLE_OPENMP),true) +ifeq ($(strip $(ENABLE_OPENMP)),true) OPENMP = -fopenmp #OPENMP = -Xpreprocessor -fopenmp #required on Macos with homebrew libomp LIBS = # -lomp @@ -11,5 +17,5 @@ VERSION = --version CFLAGS = -Ofast -std=c99 $(OPENMP) #CFLAGS = -Ofast -fnt-store=aggressive -std=c99 $(OPENMP) #AMD CLANG LFLAGS = $(OPENMP) -DEFINES = -D_GNU_SOURCE -INCLUDES = -I~/.local/include +DEFINES += -D_GNU_SOURCE +INCLUDES = -I/Users/jan/.local/include diff --git a/src/comm.c b/src/comm.c index fab8f98..d071965 100644 --- a/src/comm.c +++ b/src/comm.c @@ -1,14 +1,23 @@ -/* - * Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +/* Copyright (C) NHR@FAU, University Erlangen-Nuremberg. * All rights reserved. This file is part of nusif-solver. * Use of this source code is governed by a MIT style * license that can be found in the LICENSE file. */ +#include "util.h" #include #include +#ifdef _MPI +#include +#endif + +#include "allocate.h" #include "comm.h" +#define MAX_EXTERNAL 100000 +#define MAX_NUM_MESSAGES 500 +#define MAX_NUM_NEIGHBOURS MAX_NUM_MESSAGES + // subroutines local to this module int sizeOfRank(int rank, int size, int N) { @@ -26,6 +35,528 @@ void commReduction(double* v, int op) #endif } +void commPartition(Comm* c, Matrix* A) +{ + int rank = c->rank; + int size = c->size; + MPI_Comm comm = c->comm; + + // Extract Matrix pieces + int start_row = A->startRow; + int stop_row = A->stopRow; + int total_nrow = A->totalNr; + long long total_nnz = A->totalNnz; + int local_nrow = A->nr; + int local_nnz = A->nnz; + int* row_ptr = (int*)A->rowPtr; + int* col_ind = (int*)A->colInd; + + // int* nnz_in_row = A->nnz_in_row; + // double** ptr_to_vals_in_row = A->ptr_to_vals_in_row; + // int** ptr_to_inds_in_row = A->ptr_to_inds_in_row; + + // We need to convert the index values for the rows on this processor + // to a local index space. We need to: + // - Determine if each index reaches to a local value or external value + // - If local, subtract start_row from index value to get local index + // - If external, find out if it is already accounted for. + // - If so, then do nothing, + // - otherwise + // - add it to the list of external indices, + // - find out which processor owns the value. + // - Set up communication for sparse MV operation. + + /////////////////////////////////////////// + // Scan the indices and transform to local + /////////////////////////////////////////// + + int* external_index = (int*)allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int)); + int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int)); + int num_external = 1; + + c->external_index = external_index; + + for (int i = 0; i < A->totalNr; i++) { + externals[i] = -1; + } + + for (int i = 0; i < local_nrow; i++) { + for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) { + int cur_ind = A->colInd[j]; + +#ifdef VERBOSE + printf("Process %d of %d getting index %d in local row %d\n", + rank, + size, + cur_ind, + i); +#endif + + // shift local rows to the start + if (start_row <= cur_ind && cur_ind <= stop_row) { + col_ind[j] -= start_row; + } else // Must find out if we have already set up this point + { + if (externals[cur_ind] == -1) { + externals[cur_ind] = num_external++; + + if (num_external <= MAX_EXTERNAL) { + external_index[num_external - 1] = cur_ind; + // Mark index as external by negating it + ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1); + } else { + cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" + << endl; + abort(); + } + } else { + // Mark index as external by adding 1 and negating it + ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1); + } + } + } + } + + //////////////////////////////////////////////////////////////////////////// + // Go through list of externals to find out which processors must be accessed. + //////////////////////////////////////////////////////////////////////////// + + A->num_external = num_external; + int* tmp_buffer = new int[size]; // Temp buffer space needed below + + // Build list of global index offset + + int* global_index_offsets = new int[size]; + for (i = 0; i < size; i++) + tmp_buffer[i] = 0; // First zero out + + tmp_buffer[rank] = start_row; // This is my start row + + // This call sends the start_row of each ith processor to the ith + // entry of global_index_offset on all processors. + // Thus, each processor know the range of indices owned by all + // other processors. + // Note: There might be a better algorithm for doing this, but this + // will work... + + MPI_Allreduce(tmp_buffer, + global_index_offsets, + size, + MPI_INT, + MPI_SUM, + MPI_COMM_WORLD); + + // Go through list of externals and find the processor that owns each + int* external_processor = new int[num_external]; + int* new_external_processor = new int[num_external]; + + for (i = 0; i < num_external; i++) { + int cur_ind = external_index[i]; + for (int j = size - 1; j >= 0; j--) + if (global_index_offsets[j] <= cur_ind) { + external_processor[i] = j; + break; + } + } + if (debug) { + t0 = mytimer() - t0; + cout << " Time in finding processors phase = " << t0 << endl; + } + + //////////////////////////////////////////////////////////////////////////// + // Sift through the external elements. For each newly encountered external + // point assign it the next index in the sequence. Then look for other + // external elements who are update by the same node and assign them the next + // set of index numbers in the sequence (ie. elements updated by the same node + // have consecutive indices). + //////////////////////////////////////////////////////////////////////////// + + if (debug) t0 = mytimer(); + + int count = local_nrow; + for (i = 0; i < num_external; i++) + external_local_index[i] = -1; + + for (i = 0; i < num_external; i++) { + if (external_local_index[i] == -1) { + external_local_index[i] = count++; + + for (j = i + 1; j < num_external; j++) { + if (external_processor[j] == external_processor[i]) + external_local_index[j] = count++; + } + } + } + + if (debug) { + t0 = mytimer() - t0; + cout << " Time in scanning external indices phase = " << t0 << endl; + } + if (debug) t0 = mytimer(); + + for (i = 0; i < local_nrow; i++) { + for (j = 0; j < nnz_in_row[i]; j++) { + if (ptr_to_inds_in_row[i][j] < 0) // Change index values of externals + { + int cur_ind = -ptr_to_inds_in_row[i][j] - 1; + ptr_to_inds_in_row[i][j] = external_local_index[externals[cur_ind]]; + } + } + } + + for (i = 0; i < num_external; i++) + new_external_processor[i] = 0; + + for (i = 0; i < num_external; i++) + new_external_processor[external_local_index[i] - local_nrow] = + external_processor[i]; + + if (debug) { + t0 = mytimer() - t0; + cout << " Time in assigning external indices phase = " << t0 << endl; + } + + if (debug_details) { + for (i = 0; i < num_external; i++) { + cout << "Processor " << rank << " of " << size << ": external processor[" << i + << "] = " << external_processor[i] << endl; + cout << "Processor " << rank << " of " << size << ": new external processor[" + << i << "] = " << new_external_processor[i] << endl; + } + } + + //////////////////////////////////////////////////////////////////////////// + /// + // Count the number of neighbors from which we receive information to update + // our external elements. Additionally, fill the array tmp_neighbors in the + // following way: + // tmp_neighbors[i] = 0 ==> No external elements are updated by + // processor i. + // tmp_neighbors[i] = x ==> (x-1)/size elements are updated from + // processor i. + /// + //////////////////////////////////////////////////////////////////////////// + + t0 = mytimer(); + int* tmp_neighbors = new int[size]; + for (i = 0; i < size; i++) + tmp_neighbors[i] = 0; + + int num_recv_neighbors = 0; + int length = 1; + + for (i = 0; i < num_external; i++) { + if (tmp_neighbors[new_external_processor[i]] == 0) { + num_recv_neighbors++; + tmp_neighbors[new_external_processor[i]] = 1; + } + tmp_neighbors[new_external_processor[i]] += size; + } + + /// sum over all processors all the tmp_neighbors arrays /// + + MPI_Allreduce(tmp_neighbors, tmp_buffer, size, MPI_INT, MPI_SUM, MPI_COMM_WORLD); + + /// decode the combined 'tmp_neighbors' (stored in tmp_buffer) + // array from all the processors + + int num_send_neighbors = tmp_buffer[rank] % size; + + /// decode 'tmp_buffer[rank] to deduce total number of elements + // we must send + + int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size; + + // + // Check to see if we have enough workspace allocated. This could be + // dynamically modified, but let's keep it simple for now... + // + + if (num_send_neighbors > MAX_NUM_MESSAGES) { + cerr << "Must increase MAX_NUM_MESSAGES in HPC_Sparse_Matrix.hpp" << endl; + cerr << "Must be at least " << num_send_neighbors << endl; + abort(); + } + + if (total_to_be_sent > MAX_EXTERNAL) { + cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl; + cerr << "Must be at least " << total_to_be_sent << endl; + abort(); + } + delete[] tmp_neighbors; + + if (debug) { + t0 = mytimer() - t0; + cout << " Time in finding neighbors phase = " << t0 << endl; + } + if (debug) + cout << "Processor " << rank << " of " << size + << ": Number of send neighbors = " << num_send_neighbors << endl; + + if (debug) + cout << "Processor " << rank << " of " << size + << ": Number of receive neighbors = " << num_recv_neighbors << endl; + + if (debug) + cout << "Processor " << rank << " of " << size + << ": Total number of elements to send = " << total_to_be_sent << endl; + + if (debug) MPI_Barrier(MPI_COMM_WORLD); + + ///////////////////////////////////////////////////////////////////////// + /// + // Make a list of the neighbors that will send information to update our + // external elements (in the order that we will receive this information). + /// + ///////////////////////////////////////////////////////////////////////// + + int* recv_list = new int[MAX_EXTERNAL]; + + j = 0; + recv_list[j++] = new_external_processor[0]; + for (i = 1; i < num_external; i++) { + if (new_external_processor[i - 1] != new_external_processor[i]) { + recv_list[j++] = new_external_processor[i]; + } + } + + // + // Send a 0 length message to each of our recv neighbors + // + + int* send_list = new int[num_send_neighbors]; + for (i = 0; i < num_send_neighbors; i++) + send_list[i] = 0; + + // + // first post receives, these are immediate receives + // Do not wait for result to come, will do that at the + // wait call below. + // + int MPI_MY_TAG = 99; + + MPI_Request* request = new MPI_Request[MAX_NUM_MESSAGES]; + for (i = 0; i < num_send_neighbors; i++) { + MPI_Irecv(tmp_buffer + i, + 1, + MPI_INT, + MPI_ANY_SOURCE, + MPI_MY_TAG, + MPI_COMM_WORLD, + request + i); + } + + // send messages + + for (i = 0; i < num_recv_neighbors; i++) + MPI_Send(tmp_buffer + i, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD); + /// + // Receive message from each send neighbor to construct 'send_list'. + /// + + MPI_Status status; + for (i = 0; i < num_send_neighbors; i++) { + if (MPI_Wait(request + i, &status)) { + cerr << "MPI_Wait error\n" << endl; + exit(-1); + } + send_list[i] = status.MPI_SOURCE; + } + + ///////////////////////////////////////////////////////////////////////// + /// + // Compare the two lists. In most cases they should be the same. + // However, if they are not then add new entries to the recv list + // that are in the send list (but not already in the recv list). + /// + ///////////////////////////////////////////////////////////////////////// + + for (j = 0; j < num_send_neighbors; j++) { + int found = 0; + for (i = 0; i < num_recv_neighbors; i++) { + if (recv_list[i] == send_list[j]) found = 1; + } + + if (found == 0) { + if (debug) + cout << "Processor " << rank << " of " << size << ": recv_list[" + << num_recv_neighbors << "] = " << send_list[j] << endl; + recv_list[num_recv_neighbors] = send_list[j]; + (num_recv_neighbors)++; + } + } + + delete[] send_list; + num_send_neighbors = num_recv_neighbors; + + if (num_send_neighbors > MAX_NUM_MESSAGES) { + cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl; + abort(); + } + + ///////////////////////////////////////////////////////////////////////// + /// Start filling HPC_Sparse_Matrix struct + ///////////////////////////////////////////////////////////////////////// + + A->total_to_be_sent = total_to_be_sent; + int* elements_to_send = new int[total_to_be_sent]; + A->elements_to_send = elements_to_send; + + for (i = 0; i < total_to_be_sent; i++) + elements_to_send[i] = 0; + + // + // Create 'new_external' which explicitly put the external elements in the + // order given by 'external_local_index' + // + + int* new_external = new int[num_external]; + for (i = 0; i < num_external; i++) { + new_external[external_local_index[i] - local_nrow] = external_index[i]; + } + + ///////////////////////////////////////////////////////////////////////// + // + // Send each processor the global index list of the external elements in the + // order that I will want to receive them when updating my external elements + // + ///////////////////////////////////////////////////////////////////////// + + int* lengths = new int[num_recv_neighbors]; + + MPI_MY_TAG++; + + // First post receives + + for (i = 0; i < num_recv_neighbors; i++) { + int partner = recv_list[i]; + MPI_Irecv(lengths + i, + 1, + MPI_INT, + partner, + MPI_MY_TAG, + MPI_COMM_WORLD, + request + i); + } + + int* neighbors = new int[MAX_NUM_NEIGHBOURS]; + int* recv_length = new int[MAX_NUM_NEIGHBOURS]; + int* send_length = new int[MAX_NUM_NEIGHBOURS]; + + A->neighbors = neighbors; + A->recv_length = recv_length; + A->send_length = send_length; + + j = 0; + for (i = 0; i < num_recv_neighbors; i++) { + int start = j; + int newlength = 0; + + // go through list of external elements until updating + // processor changes + + while ((j < num_external) && (new_external_processor[j] == recv_list[i])) { + newlength++; + j++; + if (j == num_external) break; + } + + recv_length[i] = newlength; + neighbors[i] = recv_list[i]; + + length = j - start; + MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD); + } + + // Complete the receives of the number of externals + + for (i = 0; i < num_recv_neighbors; i++) { + if (MPI_Wait(request + i, &status)) { + cerr << "MPI_Wait error\n" << endl; + exit(-1); + } + send_length[i] = lengths[i]; + } + delete[] lengths; + + /////////////////////////////////////////////////////////////////// + // Build "elements_to_send" list. These are the x elements I own + // that need to be sent to other processors. + /////////////////////////////////////////////////////////////////// + + MPI_MY_TAG++; + + j = 0; + for (i = 0; i < num_recv_neighbors; i++) { + MPI_Irecv(elements_to_send + j, + send_length[i], + MPI_INT, + neighbors[i], + MPI_MY_TAG, + MPI_COMM_WORLD, + request + i); + j += send_length[i]; + } + + j = 0; + for (i = 0; i < num_recv_neighbors; i++) { + int start = j; + int newlength = 0; + + // Go through list of external elements + // until updating processor changes. This is redundant, but + // saves us from recording this information. + + while ((j < num_external) && (new_external_processor[j] == recv_list[i])) { + + newlength++; + j++; + if (j == num_external) break; + } + MPI_Send(new_external + start, + j - start, + MPI_INT, + recv_list[i], + MPI_MY_TAG, + MPI_COMM_WORLD); + } + + // receive from each neighbor the global index list of external elements + + for (i = 0; i < num_recv_neighbors; i++) { + if (MPI_Wait(request + i, &status)) { + cerr << "MPI_Wait error\n" << endl; + exit(-1); + } + } + + /// replace global indices by local indices /// + + for (i = 0; i < total_to_be_sent; i++) + elements_to_send[i] -= start_row; + + //////////////// + // Finish up !! + //////////////// + + A->num_send_neighbors = num_send_neighbors; + A->local_ncol = A->local_nrow + num_external; + + // Used in exchange_externals + double* send_buffer = new double[total_to_be_sent]; + A->send_buffer = send_buffer; + + delete[] tmp_buffer; + delete[] global_index_offsets; + delete[] recv_list; + delete[] external_processor; + delete[] new_external; + delete[] new_external_processor; + delete[] request; + + return; +} + void commPrintConfig(Comm* c) { #ifdef _MPI @@ -70,57 +601,6 @@ void commInit(Comm* c, int argc, char** argv) #endif } -void commTestInit(Comm* c, double* p, double* f, double* g) -{ - int imax = c->imaxLocal; - int jmax = c->jmaxLocal; - int rank = c->rank; - - for (int j = 0; j < jmax + 2; j++) { - for (int i = 0; i < imax + 2; i++) { - p[j * (imax + 2) + i] = rank; - f[j * (imax + 2) + i] = rank; - g[j * (imax + 2) + i] = rank; - } - } -} - -static void testWriteFile(char* filename, double* grid, int imax, int jmax) -{ - FILE* fp = fopen(filename, "w"); - - if (fp == NULL) { - printf("Error!\n"); - exit(EXIT_FAILURE); - } - - for (int j = 0; j < jmax + 2; j++) { - for (int i = 0; i < imax + 2; i++) { - fprintf(fp, "%.2f ", grid[j * (imax + 2) + i]); - } - fprintf(fp, "\n"); - } - - fclose(fp); -} - -void commTestWrite(Comm* c, double* p, double* f, double* g) -{ - int imax = c->imaxLocal; - int jmax = c->jmaxLocal; - int rank = c->rank; - - char filename[30]; - snprintf(filename, 30, "ptest-%d.dat", rank); - testWriteFile(filename, p, imax, jmax); - - snprintf(filename, 30, "ftest-%d.dat", rank); - testWriteFile(filename, f, imax, jmax); - - snprintf(filename, 30, "gtest-%d.dat", rank); - testWriteFile(filename, g, imax, jmax); -} - void commFinalize(Comm* c) { #ifdef _MPI diff --git a/src/comm.h b/src/comm.h index c1bf93f..dfe296e 100644 --- a/src/comm.h +++ b/src/comm.h @@ -8,10 +8,8 @@ #include #endif -enum direction { LEFT = 0, RIGHT, BOTTOM, TOP, NDIRS }; -enum dimension { IDIM = 0, JDIM, NDIMS }; -enum cdimension { CJDIM = 0, CIDIM }; -enum layer { HALO = 0, BULK }; +#include "matrix.h" + enum op { MAX = 0, SUM }; typedef struct { @@ -19,37 +17,27 @@ typedef struct { int size; #if defined(_MPI) MPI_Comm comm; - MPI_Datatype bufferTypes[NDIRS]; - MPI_Aint sdispls[NDIRS]; - MPI_Aint rdispls[NDIRS]; + + int num_external; + int num_send_neighbors; + int* external_index; + int* external_local_index; + int total_to_be_sent; + int* elements_to_send; + int* neighbors; + int* recv_length; + int* send_length; + double* send_buffer; #endif - int neighbours[NDIRS]; - int coords[NDIMS], dims[NDIMS]; - int imaxLocal, jmaxLocal; } Comm; extern int sizeOfRank(int rank, int size, int N); extern void commInit(Comm* c, int argc, char** argv); -extern void commTestInit(Comm* c, double* p, double* f, double* g); -extern void commTestWrite(Comm* c, double* p, double* f, double* g); extern void commFinalize(Comm* c); -extern void commPartition(Comm* c, int jmax, int imax); +extern void commPartition(Comm*, Matrix* m); extern void commPrintConfig(Comm*); extern void commExchange(Comm*, double*); -extern void commShift(Comm* c, double* f, double* g); extern void commReduction(double* v, int op); -extern int commIsBoundary(Comm* c, int direction); -extern void commUpdateDatatypes(Comm*, Comm*, int, int); -extern void commFreeCommunicator(Comm*); -extern void commCollectResult(Comm* c, - double* ug, - double* vg, - double* pg, - double* u, - double* v, - double* p, - int jmax, - int imax); static inline int commIsMaster(Comm* c) { return c->rank == 0; } #endif // __COMM_H_ diff --git a/src/main.c b/src/main.c index 77a8fee..c700a7a 100644 --- a/src/main.c +++ b/src/main.c @@ -2,61 +2,111 @@ * All rights reserved. * Use of this source code is governed by a MIT-style * license that can be found in the LICENSE file.*/ +#include #include #include #include +#include "allocate.h" #include "comm.h" #include "matrix.h" -#include "progress.h" +#include "parameter.h" #include "solver.h" #include "timing.h" +#include "util.h" + +CG_FLOAT compute_residual(Solver* s) +{ + CG_FLOAT residual = 0.0; + int n = s->A.nr; + CG_FLOAT* v1 = s->x; + CG_FLOAT* v2 = s->xexact; + + for (int i = 0; i < n; i++) { + double diff = fabs(v1[i] - v2[i]); + if (diff > residual) residual = diff; + } + + commReduction(&residual, MAX); + + return residual; +} int main(int argc, char** argv) { int rank; double timeStart, timeStop; - Matrix m; + Parameter param; Solver s; + Comm comm; - // commInit(s.comm, argc, argv); - - // FILE* fp; - // if (commIsMaster(s.comm)) fp = initResidualWriter(); + commInit(&comm, argc, argv); + initParameter(¶m); if (argc != 2) { printf("Usage: %s \n", argv[0]); exit(EXIT_SUCCESS); } - matrixRead(&m, argv[1]); - matrixDump(&m); - exit(EXIT_SUCCESS); + readParameter(¶m, argv[1]); - double te = 1000; - double t = 0.0; - double res = 0.0; + CG_FLOAT eps = (CG_FLOAT)param.eps; + int itermax = param.itermax; + initSolver(&s, ¶m); + // matrixDump(&s.A); + CG_UINT nrow = s.A.nr; + CG_UINT ncol = s.A.nc; + CG_FLOAT* r = (CG_FLOAT*)allocate(64, nrow * sizeof(CG_FLOAT)); + CG_FLOAT* p = (CG_FLOAT*)allocate(64, ncol * sizeof(CG_FLOAT)); + CG_FLOAT* Ap = (CG_FLOAT*)allocate(64, nrow * sizeof(CG_FLOAT)); + CG_FLOAT normr = 0.0; + CG_FLOAT rtrans = 0.0, oldrtrans; + + waxpby(nrow, 1.0, s.x, 0.0, s.x, p); + spMVM(&s.A, p, Ap); + waxpby(nrow, 1.0, s.b, -1.0, Ap, r); + ddot(nrow, r, r, &rtrans); + + normr = sqrt(rtrans); + + // initial iteration + waxpby(nrow, 1.0, r, 0.0, r, p); + + // TICK(); exchange_externals(A,p); TOCK(t5); + spMVM(&s.A, p, Ap); + double alpha = 0.0; + ddot(nrow, p, Ap, &alpha); + alpha = rtrans / alpha; + waxpby(nrow, 1.0, s.x, alpha, p, s.x); + waxpby(nrow, 1.0, r, -alpha, Ap, r); + + int k; timeStart = getTimeStamp(); - while (t <= te) { - - // if (commIsMaster(s.comm)) writeResidual(fp, t, res); - - t += 0.5; - -#ifdef VERBOSE - if (commIsMaster(s.comm)) { - printf("TIME %f , TIMESTEP %f\n", t, d.dt); - } -#else - printProgress(t); -#endif + for (k = 1; k < itermax && normr > eps; k++) { + oldrtrans = rtrans; + ddot(nrow, r, r, &rtrans); + double beta = rtrans / oldrtrans; + waxpby(nrow, 1.0, r, beta, p, p); + // TICK(); exchange_externals(A,p); TOCK(t5); + spMVM(&s.A, p, Ap); + alpha = 0.0; + ddot(nrow, p, Ap, &alpha); + alpha = rtrans / alpha; + waxpby(nrow, 1.0, s.x, alpha, p, s.x); + waxpby(nrow, 1.0, r, -alpha, Ap, r); } timeStop = getTimeStamp(); - if (commIsMaster(s.comm)) { - printf("Solution took %.2fs\n", timeStop - timeStart); + + double residual = compute_residual(&s); + + if (commIsMaster(&comm)) { + printf("Solution performed %d iterations and took %.2fs\n", + k, + timeStop - timeStart); + printf("Difference between computed and exact = %f\n", residual); } - // if (commIsMaster(s.comm)) fclose(fp); - commFinalize(s.comm); + + commFinalize(&comm); return EXIT_SUCCESS; } diff --git a/src/matrix.c b/src/matrix.c index 393c5a4..ff6ed66 100644 --- a/src/matrix.c +++ b/src/matrix.c @@ -94,7 +94,7 @@ void matrixRead(Matrix* m, char* filename) Entry* mm; if (sym_flag) { - mm = (Entry*)allocate(64, nz * 2 * sizeof(Entry)); + mm = (Entry*)allocate(ARRAY_ALIGNMENT, nz * 2 * sizeof(Entry)); } else { mm = (Entry*)allocate(64, nz * sizeof(Entry)); } diff --git a/src/matrix.h b/src/matrix.h index 7f3a941..92b08eb 100644 --- a/src/matrix.h +++ b/src/matrix.h @@ -4,6 +4,7 @@ * license that can be found in the LICENSE file.*/ #ifndef __MATRIX_H_ #define __MATRIX_H_ +#include #include "util.h" @@ -14,7 +15,9 @@ typedef struct { } Entry; typedef struct { - CG_UINT nr, nnz; // number of rows and non zeros + CG_UINT nr, nc, nnz; // number of rows, columns and non zeros + CG_UINT totalNr, totalNnz; // number of rows and non zeros + CG_UINT startRow, stopRow; CG_UINT *colInd, *rowPtr; // colum Indices, row Pointer CG_FLOAT* val; } Matrix; diff --git a/src/parameter.c b/src/parameter.c index e5e7029..136a823 100644 --- a/src/parameter.c +++ b/src/parameter.c @@ -1,9 +1,7 @@ -/* - * Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +/* Copyright (C) NHR@FAU, University Erlangen-Nuremberg. * All rights reserved. This file is part of nusif-solver. * Use of this source code is governed by a MIT style - * license that can be found in the LICENSE file. - */ + * license that can be found in the LICENSE file. */ #include #include #include @@ -14,16 +12,12 @@ void initParameter(Parameter* param) { - param->xlength = 1.0; - param->ylength = 1.0; - param->imax = 100; - param->jmax = 100; - param->itermax = 1000; - param->eps = 0.0001; - param->omg = 1.8; - param->levels = 5; - param->presmooth = 5; - param->postsmooth = 5; + param->filename = "generate"; + param->nx = 100; + param->ny = 100; + param->nz = 1000; + param->itermax = 1000; + param->eps = 0.0001; } void readParameter(Parameter* param, const char* filename) @@ -56,31 +50,11 @@ void readParameter(Parameter* param, const char* filename) #define PARSE_REAL(p) PARSE_PARAM(p, atof) if (tok != NULL && val != NULL) { - PARSE_REAL(xlength); - PARSE_REAL(ylength); - PARSE_INT(imax); - PARSE_INT(jmax); + PARSE_INT(nx); + PARSE_INT(ny); + PARSE_INT(nz); PARSE_INT(itermax); PARSE_REAL(eps); - PARSE_REAL(omg); - PARSE_REAL(re); - PARSE_REAL(tau); - PARSE_REAL(gamma); - PARSE_REAL(dt); - PARSE_REAL(te); - PARSE_REAL(gx); - PARSE_REAL(gy); - PARSE_STRING(name); - PARSE_INT(bcLeft); - PARSE_INT(bcRight); - PARSE_INT(bcBottom); - PARSE_INT(bcTop); - PARSE_INT(levels); - PARSE_INT(presmooth); - PARSE_INT(postsmooth); - PARSE_REAL(u_init); - PARSE_REAL(v_init); - PARSE_REAL(p_init); } } @@ -89,26 +63,10 @@ void readParameter(Parameter* param, const char* filename) void printParameter(Parameter* param) { - printf("Parameters for %s\n", param->name); - printf("Boundary conditions Left:%d Right:%d Bottom:%d Top:%d\n", - param->bcLeft, - param->bcRight, - param->bcBottom, - param->bcTop); - printf("\tReynolds number: %.2f\n", param->re); - printf("\tInit arrays: U:%.2f V:%.2f P:%.2f\n", - param->u_init, - param->v_init, - param->p_init); + printf("Parameters\n"); printf("Geometry data:\n"); - printf("\tDomain box size (x, y): %.2f, %.2f\n", param->xlength, param->ylength); - printf("\tCells (x, y): %d, %d\n", param->imax, param->jmax); - printf("Timestep parameters:\n"); - printf("\tDefault stepsize: %.2f, Final time %.2f\n", param->dt, param->te); - printf("\tTau factor: %.2f\n", param->tau); + printf("\tPoints (x, y, z): %d, %d, %d\n", param->nx, param->ny, param->nz); printf("Iterative solver parameters:\n"); printf("\tMax iterations: %d\n", param->itermax); printf("\tepsilon (stopping tolerance) : %f\n", param->eps); - printf("\tgamma (stopping tolerance) : %f\n", param->gamma); - printf("\tomega (SOR relaxation): %f\n", param->omg); } diff --git a/src/parameter.h b/src/parameter.h index ed765eb..fed92aa 100644 --- a/src/parameter.h +++ b/src/parameter.h @@ -1,24 +1,15 @@ -/* - * Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +/* Copyright (C) NHR@FAU, University Erlangen-Nuremberg. * All rights reserved. This file is part of nusif-solver. * Use of this source code is governed by a MIT style - * license that can be found in the LICENSE file. - */ + * license that can be found in the LICENSE file. */ #ifndef __PARAMETER_H_ #define __PARAMETER_H_ typedef struct { - double xlength, ylength; - int imax, jmax; + char* filename; + int nx, ny, nz; int itermax; - double eps, omg; - double re, tau, gamma; - double te, dt; - double gx, gy; - char* name; - int bcLeft, bcRight, bcBottom, bcTop; - double u_init, v_init, p_init; - int levels, presmooth, postsmooth; + double eps; } Parameter; void initParameter(Parameter*); diff --git a/src/progress.c b/src/progress.c index 2358e43..44d5f39 100644 --- a/src/progress.c +++ b/src/progress.c @@ -1,15 +1,12 @@ -/* - * Copyright (C) NHR@FAU, University Erlangen-Nuremberg. +/* Copyright (C) NHR@FAU, University Erlangen-Nuremberg. * All rights reserved. This file is part of nusif-solver. * Use of this source code is governed by a MIT style - * license that can be found in the LICENSE file. - */ -#include -#include -#include -#include -#include + * license that can be found in the LICENSE file. */ #include "progress.h" +#include +#include +#include +#include static double _end; static int _current; diff --git a/src/solver.c b/src/solver.c index 556ad68..a2c7156 100644 --- a/src/solver.c +++ b/src/solver.c @@ -2,13 +2,147 @@ * All rights reserved. * Use of this source code is governed by a MIT-style * license that can be found in the LICENSE file.*/ +#include +#include +#include #include +#include +#include "allocate.h" +#include "comm.h" #include "matrix.h" #include "solver.h" #include "util.h" -void spMVM(Matrix* m, double* x, double* y) +static void matrixGenerate(Parameter* p, Solver* s, bool use_7pt_stencil) +{ + int size = 1; // Serial case (not using MPI) + int rank = 0; + + CG_UINT local_nrow = p->nx * p->ny * p->nz; + CG_UINT local_nnz = 27 * local_nrow; + + CG_UINT total_nrow = local_nrow * size; + CG_UINT total_nnz = 27 * total_nrow; + + int start_row = local_nrow * rank; + int stop_row = start_row + local_nrow - 1; + + if (use_7pt_stencil) { + printf("Generate 7pt matrix with "); + } else { + printf("Generate 27pt matrix with "); + } + printf("%d total rows and %d nonzeros\n", (int)total_nrow, (int)local_nnz); + + s->A.val = (CG_FLOAT*)allocate(64, local_nnz * sizeof(CG_FLOAT)); + s->A.colInd = (CG_UINT*)allocate(64, local_nnz * sizeof(CG_UINT)); + s->A.rowPtr = (CG_UINT*)allocate(64, (local_nrow + 1) * sizeof(CG_UINT)); + s->x = (CG_FLOAT*)allocate(64, local_nrow * sizeof(CG_FLOAT)); + s->b = (CG_FLOAT*)allocate(64, local_nrow * sizeof(CG_FLOAT)); + s->xexact = (CG_FLOAT*)allocate(64, local_nrow * sizeof(CG_FLOAT)); + + CG_FLOAT* curvalptr = s->A.val; + CG_UINT* curindptr = s->A.colInd; + CG_UINT* currowptr = s->A.rowPtr; + CG_FLOAT* x = s->x; + CG_FLOAT* b = s->b; + CG_FLOAT* xexact = s->xexact; + + CG_UINT nnzglobal = 0; + int nx = p->nx, ny = p->ny, nz = p->nz; + CG_UINT cursor = 0; + + // for (int i = 0; i < local_nnz; i++) { + // curvalptr[i] = 0.0; + // printf("%d-%f, ", i, m->val[i]); + // } + // printf("\n"); + *currowptr++ = 0; + + for (int iz = 0; iz < nz; iz++) { + for (int iy = 0; iy < ny; iy++) { + for (int ix = 0; ix < nx; ix++) { + + int curlocalrow = iz * nx * ny + iy * nx + ix; + int currow = start_row + iz * nx * ny + iy * nx + ix; + int nnzrow = 0; + + // (*A)->ptr_to_vals_in_row[curlocalrow] = curvalptr; + // (*A)->ptr_to_inds_in_row[curlocalrow] = curindptr; + + for (int sz = -1; sz <= 1; sz++) { + for (int sy = -1; sy <= 1; sy++) { + for (int sx = -1; sx <= 1; sx++) { + + int curcol = currow + sz * nx * ny + sy * nx + sx; + // Since we have a stack of nx by ny by nz domains + // , stacking in the z direction, we check to see + // if sx and sy are reaching outside of the domain, + // while the check for the curcol being valid is + // sufficient to check the z values + if ((ix + sx >= 0) && (ix + sx < nx) && (iy + sy >= 0) && + (iy + sy < ny) && (curcol >= 0 && curcol < total_nrow)) { + // This logic will skip over point that are not part of a + // 7-pt stencil + if (!use_7pt_stencil || + (sz * sz + sy * sy + sx * sx <= 1)) { + if (curcol == currow) { + *curvalptr++ = 27.0; + } else { + *curvalptr++ = -1.0; + } + *curindptr++ = curcol; + nnzrow++; + } + } + } // end sx loop + } // end sy loop + } // end sz loop + + *currowptr = *(currowptr - 1) + nnzrow; + // printf("%d:%d-%lld, ", currow, nnzrow, *currowptr); + currowptr++; + nnzglobal += nnzrow; + x[curlocalrow] = 0.0; + b[curlocalrow] = 27.0 - ((CG_FLOAT)(nnzrow - 1)); + xexact[curlocalrow] = 1.0; + } // end ix loop + } // end iy loop + } // end iz loop + +#ifdef VERBOSE + printf("Process %d of %d has %d rows\n", rank, size, local_nrow); + printf("Global rows %d through %d\n", start_row, stop_row); + printf("%d nonzeros\n", start_row, stop_row); +#endif /* ifdef VERBOSE */ + + // for (int i = 0; i < local_nnz; i++) { + // printf("%d:%f, ", (int)m->colInd[i], m->val[i]); + // } + // printf("\n"); + + s->A.startRow = start_row; + s->A.stopRow = stop_row; + s->A.totalNr = total_nrow; + s->A.totalNnz = total_nnz; + s->A.nr = local_nrow; + s->A.nc = local_nrow; + s->A.nnz = local_nnz; +} + +void initSolver(Solver* s, Parameter* p) +{ + if (!strcmp(p->filename, "generate")) { + matrixGenerate(p, s, false); + } else if (!strcmp(p->filename, "generate7P")) { + matrixGenerate(p, s, true); + } else { + matrixRead(&s->A, p->filename); + } +} + +void spMVM(Matrix* m, const CG_FLOAT* restrict x, CG_FLOAT* restrict y) { CG_UINT numRows = m->nr; CG_UINT* rowPtr = m->rowPtr; @@ -26,3 +160,46 @@ void spMVM(Matrix* m, double* x, double* y) y[rowID] = tmp; } } + +void waxpby(const CG_UINT n, + const CG_FLOAT alpha, + const CG_FLOAT* restrict x, + const CG_FLOAT beta, + const CG_FLOAT* restrict y, + CG_FLOAT* const w) +{ + if (alpha == 1.0) { + for (size_t i = 0; i < n; i++) { + w[i] = x[i] + beta * y[i]; + } + } else if (beta == 1.0) { + for (size_t i = 0; i < n; i++) { + w[i] = alpha * x[i] + y[i]; + } + } else { + for (size_t i = 0; i < n; i++) { + w[i] = alpha * x[i] + beta * y[i]; + } + } +} + +void ddot(const CG_UINT n, + const CG_FLOAT* restrict x, + const CG_FLOAT* restrict y, + CG_FLOAT* restrict result) +{ + CG_FLOAT sum = 0.0; + + if (y == x) { + for (size_t i = 0; i < n; i++) { + sum += x[i] * x[i]; + } + } else { + for (size_t i = 0; i < n; i++) { + sum += x[i] * y[i]; + } + } + + commReduction(&sum, SUM); + *result = sum; +} diff --git a/src/solver.h b/src/solver.h index be35da6..8a1b81c 100644 --- a/src/solver.h +++ b/src/solver.h @@ -4,21 +4,29 @@ * license that can be found in the LICENSE file. */ #ifndef __SOLVER_H_ #define __SOLVER_H_ -#include "comm.h" #include "matrix.h" #include "parameter.h" #include "util.h" typedef struct { - /* geometry and grid information */ - /* parameters */ - double eps, omega; - int itermax; - - /* communication */ - Comm* comm; + Matrix A; + CG_FLOAT* x; + CG_FLOAT* b; + CG_FLOAT* xexact; } Solver; -void initSolver(Solver*, Parameter*); -void spMVM(Matrix* m, CG_FLOAT* x, CG_FLOAT* y); +void initSolver(Solver* s, Parameter*); +void spMVM(Matrix* m, const CG_FLOAT* restrict x, CG_FLOAT* restrict y); + +void waxpby(const CG_UINT n, + const CG_FLOAT alpha, + const CG_FLOAT* restrict x, + const CG_FLOAT beta, + const CG_FLOAT* restrict y, + double* restrict w); + +void ddot(const CG_UINT n, + const CG_FLOAT* restrict e, + const CG_FLOAT* restrict y, + CG_FLOAT* restrict result); #endif // __SOLVER_H_