Partial port of communication code

Jan Eitzinger 2025-01-17 20:30:18 +01:00
parent 512e2903c8
commit 9f37fa73a9
Signed by: moebiusband
GPG Key ID: 2574BA29B90D6DD5
2 changed files with 248 additions and 257 deletions

View File

@@ -14,10 +14,6 @@
 #include "allocate.h"
 #include "comm.h"
 
-#define MAX_EXTERNAL       100000
-#define MAX_NUM_MESSAGES   500
-#define MAX_NUM_NEIGHBOURS MAX_NUM_MESSAGES
-
 // subroutines local to this module
 int sizeOfRank(int rank, int size, int N)
 {
@@ -37,6 +33,7 @@ void commReduction(double* v, int op)
 
 void commPartition(Comm* c, Matrix* A)
 {
+#ifdef _MPI
     int rank      = c->rank;
     int size      = c->size;
     MPI_Comm comm = c->comm;
@@ -66,14 +63,11 @@ void commPartition(Comm* c, Matrix* A)
     // - find out which processor owns the value.
     // - Set up communication for sparse MV operation.
 
-    ///////////////////////////////////////////
     // Scan the indices and transform to local
-    ///////////////////////////////////////////
-    int* external_index = (int*)allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
-
     int* externals   = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
     int num_external = 1;
+    int* external_index = (int*)allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
     c->external_index = external_index;
 
     for (int i = 0; i < A->totalNr; i++) {
@@ -95,49 +89,39 @@ void commPartition(Comm* c, Matrix* A)
             // shift local rows to the start
             if (start_row <= cur_ind && cur_ind <= stop_row) {
                 col_ind[j] -= start_row;
-            } else // Must find out if we have already set up this point
-            {
+            } else {
+                // Must find out if we have already set up this point
                 if (externals[cur_ind] == -1) {
                     externals[cur_ind] = num_external++;
 
                     if (num_external <= MAX_EXTERNAL) {
                         external_index[num_external - 1] = cur_ind;
                         // Mark index as external by negating it
-                        ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1);
+                        col_ind[j] = -col_ind[j];
                     } else {
-                        cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp"
-                             << endl;
-                        abort();
+                        printf("Must increase MAX_EXTERNAL\n");
+                        exit(EXIT_FAILURE);
                     }
                 } else {
-                    // Mark index as external by adding 1 and negating it
-                    ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1);
+                    // Mark index as external by negating it
+                    col_ind[j] = -col_ind[j];
                 }
             }
         }
     }
 
-    ////////////////////////////////////////////////////////////////////////////
-    // Go through list of externals to find out which processors must be accessed.
-    ////////////////////////////////////////////////////////////////////////////
-
-    A->num_external = num_external;
-    int* tmp_buffer = new int[size]; // Temp buffer space needed below
-
-    // Build list of global index offset
-    int* global_index_offsets = new int[size];
-
-    for (i = 0; i < size; i++)
-        tmp_buffer[i] = 0; // First zero out
-
-    tmp_buffer[rank] = start_row; // This is my start row
-
-    // This call sends the start_row of each ith processor to the ith
-    // entry of global_index_offset on all processors.
-    // Thus, each processor know the range of indices owned by all
-    // other processors.
-    // Note: There might be a better algorithm for doing this, but this
-    // will work...
+    /**************************************************************************
+    Go through list of externals to find out which processors must be accessed.
+    **************************************************************************/
+    c->num_external = num_external;
+    int tmp_buffer[size];
+    int global_index_offsets[size];
+
+    for (int i = 0; i < size; i++) {
+        tmp_buffer[i] = 0;
+    }
+
+    tmp_buffer[rank] = start_row;
 
     MPI_Allreduce(tmp_buffer,
         global_index_offsets,
@@ -147,105 +131,97 @@ void commPartition(Comm* c, Matrix* A)
         MPI_COMM_WORLD);
 
     // Go through list of externals and find the processor that owns each
-    int* external_processor     = new int[num_external];
-    int* new_external_processor = new int[num_external];
+    int external_processor[num_external];
 
-    for (i = 0; i < num_external; i++) {
+    for (int i = 0; i < num_external; i++) {
         int cur_ind = external_index[i];
-        for (int j = size - 1; j >= 0; j--)
+        for (int j = size - 1; j >= 0; j--) {
             if (global_index_offsets[j] <= cur_ind) {
                 external_processor[i] = j;
                 break;
             }
-    }
-
-    if (debug) {
-        t0 = mytimer() - t0;
-        cout << " Time in finding processors phase = " << t0 << endl;
+        }
     }
 
-    ////////////////////////////////////////////////////////////////////////////
-    // Sift through the external elements. For each newly encountered external
-    // point assign it the next index in the sequence. Then look for other
-    // external elements who are update by the same node and assign them the next
-    // set of index numbers in the sequence (ie. elements updated by the same node
-    // have consecutive indices).
-    ////////////////////////////////////////////////////////////////////////////
-
-    if (debug) t0 = mytimer();
+    /*Go through the external elements. For each newly encountered external
+    point assign it the next index in the local sequence. Then look for other
+    external elements who are updated by the same node and assign them the next
+    set of index numbers in the local sequence (ie. elements updated by the same
+    node have consecutive indices).*/
+    int* external_local_index = (int*)allocate(ARRAY_ALIGNMENT,
+        MAX_EXTERNAL * sizeof(int));
+    c->external_local_index = external_local_index;
 
     int count = local_nrow;
-    for (i = 0; i < num_external; i++)
-        external_local_index[i] = -1;
 
-    for (i = 0; i < num_external; i++) {
+    for (int i = 0; i < num_external; i++) {
+        external_local_index[i] = -1;
+    }
+
+    for (int i = 0; i < num_external; i++) {
         if (external_local_index[i] == -1) {
             external_local_index[i] = count++;
 
-            for (j = i + 1; j < num_external; j++) {
-                if (external_processor[j] == external_processor[i])
+            for (int j = i + 1; j < num_external; j++) {
+                if (external_processor[j] == external_processor[i]) {
                     external_local_index[j] = count++;
+                }
             }
         }
     }
 
-    if (debug) {
-        t0 = mytimer() - t0;
-        cout << " Time in scanning external indices phase = " << t0 << endl;
-    }
-
-    if (debug) t0 = mytimer();
-
-    for (i = 0; i < local_nrow; i++) {
-        for (j = 0; j < nnz_in_row[i]; j++) {
-            if (ptr_to_inds_in_row[i][j] < 0) // Change index values of externals
-            {
-                int cur_ind = -ptr_to_inds_in_row[i][j] - 1;
-                ptr_to_inds_in_row[i][j] = external_local_index[externals[cur_ind]];
+    // map all external ids to the new local index
+    CG_UINT* rowPtr = A->rowPtr;
+
+    for (int i = 0; i < local_nrow; i++) {
+        for (int j = rowPtr[i]; j < rowPtr[i + 1]; j++) {
+            if (col_ind[j] < 0) {
+                int cur_ind = -col_ind[j] - 1; // FIXME: Offset by 1??
+                col_ind[j]  = external_local_index[externals[cur_ind]];
             }
         }
     }
 
-    for (i = 0; i < num_external; i++)
+    int new_external_processor[num_external];
+
+    for (int i = 0; i < num_external; i++) {
         new_external_processor[i] = 0;
+    }
 
-    for (i = 0; i < num_external; i++)
+    // setup map from external id to partition
+    for (int i = 0; i < num_external; i++) {
         new_external_processor[external_local_index[i] - local_nrow] =
             external_processor[i];
-
-    if (debug) {
-        t0 = mytimer() - t0;
-        cout << " Time in assigning external indices phase = " << t0 << endl;
     }
 
-    if (debug_details) {
-        for (i = 0; i < num_external; i++) {
-            cout << "Processor " << rank << " of " << size << ": external processor["
-                 << i << "] = " << external_processor[i] << endl;
-            cout << "Processor " << rank << " of " << size << ": new external processor["
-                 << i << "] = " << new_external_processor[i] << endl;
-        }
-    }
+#ifdef VERBOSE
+    for (int i = 0; i < num_external; i++) {
+        printf("Process %d of %d: external process[%d] = %d\n",
+            rank,
+            size,
+            i,
+            external_processor[i]);
+    }
+#endif
 
-    ////////////////////////////////////////////////////////////////////////////
-    ///
-    // Count the number of neighbors from which we receive information to update
-    // our external elements. Additionally, fill the array tmp_neighbors in the
-    // following way:
-    //      tmp_neighbors[i] = 0   ==>  No external elements are updated by
-    //                                  processor i.
-    //      tmp_neighbors[i] = x   ==>  (x-1)/size elements are updated from
-    //                                  processor i.
-    ///
-    ////////////////////////////////////////////////////////////////////////////
-
-    t0 = mytimer();
-    int* tmp_neighbors = new int[size];
-    for (i = 0; i < size; i++)
+    /* Count the number of neighbors from which we receive information to update
+    our external elements. Additionally, fill the array tmp_neighbors in the
+    following way:
+          tmp_neighbors[i] = 0   ==>  No external elements are updated by
+                                      processor i.
+          tmp_neighbors[i] = x   ==>  (x-1)/size elements are updated from
+                                      processor i.*/
+    int tmp_neighbors[size];
+
+    for (int i = 0; i < size; i++) {
         tmp_neighbors[i] = 0;
+    }
 
     int num_recv_neighbors = 0;
     int length             = 1;
 
-    for (i = 0; i < num_external; i++) {
+    for (int i = 0; i < num_external; i++) {
         if (tmp_neighbors[new_external_processor[i]] == 0) {
             num_recv_neighbors++;
             tmp_neighbors[new_external_processor[i]] = 1;
@@ -253,90 +229,72 @@ void commPartition(Comm* c, Matrix* A)
         tmp_neighbors[new_external_processor[i]] += size;
     }
 
-    /// sum over all processors all the tmp_neighbors arrays ///
+    // sum over all processors all the tmp_neighbors arrays
     MPI_Allreduce(tmp_neighbors, tmp_buffer, size, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 
-    /// decode the combined 'tmp_neighbors' (stored in tmp_buffer)
-    // array from all the processors
+    /* decode the combined 'tmp_neighbors' (stored in tmp_buffer) array from all the
+    processors */
     int num_send_neighbors = tmp_buffer[rank] % size;
 
-    /// decode 'tmp_buffer[rank] to deduce total number of elements
-    // we must send
+    /* decode 'tmp_buffer[rank] to deduce total number of elements we must send */
     int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;
 
-    //
-    // Check to see if we have enough workspace allocated. This could be
-    // dynamically modified, but let's keep it simple for now...
-    //
+    /* Check to see if we have enough workspace allocated. This could be
+    dynamically modified, but let's keep it simple for now...*/
     if (num_send_neighbors > MAX_NUM_MESSAGES) {
-        cerr << "Must increase MAX_NUM_MESSAGES in HPC_Sparse_Matrix.hpp" << endl;
-        cerr << "Must be at least " << num_send_neighbors << endl;
-        abort();
+        printf("Must increase MAX_NUM_MESSAGES. Must be at least %d\n",
+            num_send_neighbors);
+        exit(EXIT_FAILURE);
     }
 
     if (total_to_be_sent > MAX_EXTERNAL) {
-        cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl;
-        cerr << "Must be at least " << total_to_be_sent << endl;
-        abort();
+        printf("Must increase MAX_EXTERNAL. Must be at least %d\n", total_to_be_sent);
+        exit(EXIT_FAILURE);
     }
 
-    delete[] tmp_neighbors;
-
-    if (debug) {
-        t0 = mytimer() - t0;
-        cout << " Time in finding neighbors phase = " << t0 << endl;
-    }
-
-    if (debug)
+#ifdef VERBOSE
     cout << "Processor " << rank << " of " << size
          << ": Number of send neighbors = " << num_send_neighbors << endl;
 
-    if (debug)
     cout << "Processor " << rank << " of " << size
          << ": Number of receive neighbors = " << num_recv_neighbors << endl;
 
-    if (debug)
     cout << "Processor " << rank << " of " << size
          << ": Total number of elements to send = " << total_to_be_sent << endl;
 
-    if (debug) MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
 
-    /////////////////////////////////////////////////////////////////////////
-    ///
-    // Make a list of the neighbors that will send information to update our
-    // external elements (in the order that we will receive this information).
-    ///
-    /////////////////////////////////////////////////////////////////////////
-    int* recv_list = new int[MAX_EXTERNAL];
+    /* Make a list of the neighbors that will send information to update our
+    external elements (in the order that we will receive this information).*/
+    int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
 
-    j = 0;
+    // FIXME: Create local scope
+    int j = 0;
     recv_list[j++] = new_external_processor[0];
 
-    for (i = 1; i < num_external; i++) {
+    for (int i = 1; i < num_external; i++) {
         if (new_external_processor[i - 1] != new_external_processor[i]) {
             recv_list[j++] = new_external_processor[i];
         }
     }
 
-    //
+    // Ensure that all the neighbors we expect to receive from also send to us
     // Send a 0 length message to each of our recv neighbors
-    //
-    int* send_list = new int[num_send_neighbors];
-    for (i = 0; i < num_send_neighbors; i++)
+    int send_list[num_send_neighbors];
+
+    for (int i = 0; i < num_send_neighbors; i++) {
         send_list[i] = 0;
+    }
 
-    //
     // first post receives, these are immediate receives
     // Do not wait for result to come, will do that at the
     // wait call below.
-    //
     int MPI_MY_TAG = 99;
-    MPI_Request* request = new MPI_Request[MAX_NUM_MESSAGES];
+    MPI_Request request[MAX_NUM_MESSAGES];
 
-    for (i = 0; i < num_send_neighbors; i++) {
+    for (int i = 0; i < num_send_neighbors; i++) {
         MPI_Irecv(tmp_buffer + i,
             1,
             MPI_INT,
@@ -347,88 +305,77 @@ void commPartition(Comm* c, Matrix* A)
     }
 
     // send messages
-    for (i = 0; i < num_recv_neighbors; i++)
+    for (int i = 0; i < num_recv_neighbors; i++) {
         MPI_Send(tmp_buffer + i, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
+    }
 
-    ///
-    // Receive message from each send neighbor to construct 'send_list'.
-    ///
+    // Receive message from each send neighbor to construct 'send_list'.
     MPI_Status status;
-    for (i = 0; i < num_send_neighbors; i++) {
+
+    for (int i = 0; i < num_send_neighbors; i++) {
         if (MPI_Wait(request + i, &status)) {
-            cerr << "MPI_Wait error\n" << endl;
-            exit(-1);
+            printf("MPI_Wait error\n");
+            exit(EXIT_FAILURE);
         }
         send_list[i] = status.MPI_SOURCE;
     }
 
-    /////////////////////////////////////////////////////////////////////////
-    ///
-    // Compare the two lists. In most cases they should be the same.
+    /* Compare the two lists. In most cases they should be the same.
     // However, if they are not then add new entries to the recv list
     // that are in the send list (but not already in the recv list).
-    ///
-    /////////////////////////////////////////////////////////////////////////
-    for (j = 0; j < num_send_neighbors; j++) {
+    WHY!! This ensures that the sendlist is equal to the recvlist
+    But why is this required? -> Just One neighbour list??*/
+    for (int j = 0; j < num_send_neighbors; j++) {
         int found = 0;
-        for (i = 0; i < num_recv_neighbors; i++) {
+        for (int i = 0; i < num_recv_neighbors; i++) {
             if (recv_list[i] == send_list[j]) found = 1;
         }
 
         if (found == 0) {
-            if (debug)
-                cout << "Processor " << rank << " of " << size << ": recv_list["
-                     << num_recv_neighbors << "] = " << send_list[j] << endl;
+#ifdef VERBOSE
+            printf("Process %d of %d: recv_list[%d] = %d\n",
+                rank,
+                size,
+                num_recv_neighbors,
+                send_list[j]);
+#endif
             recv_list[num_recv_neighbors] = send_list[j];
             (num_recv_neighbors)++;
         }
     }
 
-    delete[] send_list;
     num_send_neighbors = num_recv_neighbors;
 
     if (num_send_neighbors > MAX_NUM_MESSAGES) {
-        cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl;
-        abort();
+        printf("Must increase MAX_EXTERNAL\n");
+        exit(EXIT_FAILURE);
     }
 
-    /////////////////////////////////////////////////////////////////////////
-    /// Start filling HPC_Sparse_Matrix struct
-    /////////////////////////////////////////////////////////////////////////
-
-    A->total_to_be_sent   = total_to_be_sent;
-    int* elements_to_send = new int[total_to_be_sent];
-    A->elements_to_send   = elements_to_send;
-
-    for (i = 0; i < total_to_be_sent; i++)
-        elements_to_send[i] = 0;
-
-    //
-    // Create 'new_external' which explicitly put the external elements in the
-    // order given by 'external_local_index'
-    //
-    int* new_external = new int[num_external];
-    for (i = 0; i < num_external; i++) {
+    // Start filling communication setup
+    c->total_to_be_sent   = total_to_be_sent;
+    int* elements_to_send = (int*)allocate(ARRAY_ALIGNMENT,
+        total_to_be_sent * sizeof(int));
+    c->elements_to_send = elements_to_send;
+
+    for (int i = 0; i < total_to_be_sent; i++) {
+        elements_to_send[i] = 0;
+    }
+
+    // Create 'new_external' which explicitly put the external elements in the
+    // order given by 'external_local_index'
+    int* new_external = (int*)allocate(ARRAY_ALIGNMENT, num_external * sizeof(int));
+
+    for (int i = 0; i < num_external; i++) {
         new_external[external_local_index[i] - local_nrow] = external_index[i];
     }
 
-    /////////////////////////////////////////////////////////////////////////
-    //
     // Send each processor the global index list of the external elements in the
     // order that I will want to receive them when updating my external elements
-    //
-    /////////////////////////////////////////////////////////////////////////
-    int* lengths = new int[num_recv_neighbors];
+    int lengths[num_recv_neighbors];
 
     MPI_MY_TAG++;
 
     // First post receives
-    for (i = 0; i < num_recv_neighbors; i++) {
+    for (int i = 0; i < num_recv_neighbors; i++) {
         int partner = recv_list[i];
         MPI_Irecv(lengths + i,
             1,
@@ -439,22 +386,18 @@ void commPartition(Comm* c, Matrix* A)
             request + i);
     }
 
-    int* neighbors   = new int[MAX_NUM_NEIGHBOURS];
-    int* recv_length = new int[MAX_NUM_NEIGHBOURS];
-    int* send_length = new int[MAX_NUM_NEIGHBOURS];
-
-    A->neighbors   = neighbors;
-    A->recv_length = recv_length;
-    A->send_length = send_length;
+    int* neighbors   = c->neighbors;
+    int* recv_length = c->recv_length;
+    int* send_length = c->send_length;
 
     j = 0;
 
-    for (i = 0; i < num_recv_neighbors; i++) {
+    for (int i = 0; i < num_recv_neighbors; i++) {
         int start     = j;
         int newlength = 0;
 
         // go through list of external elements until updating
         // processor changes
         while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
             newlength++;
             j++;
@@ -469,25 +412,21 @@ void commPartition(Comm* c, Matrix* A)
     }
 
     // Complete the receives of the number of externals
-    for (i = 0; i < num_recv_neighbors; i++) {
+    for (int i = 0; i < num_recv_neighbors; i++) {
         if (MPI_Wait(request + i, &status)) {
-            cerr << "MPI_Wait error\n" << endl;
-            exit(-1);
+            printf("MPI_Wait error\n");
+            exit(EXIT_FAILURE);
         }
 
         send_length[i] = lengths[i];
     }
 
-    delete[] lengths;
-
-    ///////////////////////////////////////////////////////////////////
     // Build "elements_to_send" list. These are the x elements I own
     // that need to be sent to other processors.
-    ///////////////////////////////////////////////////////////////////
-
     MPI_MY_TAG++;
 
     j = 0;
 
-    for (i = 0; i < num_recv_neighbors; i++) {
+    for (int i = 0; i < num_recv_neighbors; i++) {
         MPI_Irecv(elements_to_send + j,
             send_length[i],
             MPI_INT,
@@ -499,14 +438,14 @@ void commPartition(Comm* c, Matrix* A)
     }
 
     j = 0;
 
-    for (i = 0; i < num_recv_neighbors; i++) {
+    for (int i = 0; i < num_recv_neighbors; i++) {
         int start     = j;
         int newlength = 0;
 
         // Go through list of external elements
         // until updating processor changes. This is redundant, but
         // saves us from recording this information.
         while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
             newlength++;
@@ -522,39 +461,101 @@ void commPartition(Comm* c, Matrix* A)
     }
 
     // receive from each neighbor the global index list of external elements
-    for (i = 0; i < num_recv_neighbors; i++) {
+    for (int i = 0; i < num_recv_neighbors; i++) {
         if (MPI_Wait(request + i, &status)) {
-            cerr << "MPI_Wait error\n" << endl;
-            exit(-1);
+            printf("MPI_Wait error\n");
+            exit(EXIT_FAILURE);
         }
     }
 
-    /// replace global indices by local indices ///
-    for (i = 0; i < total_to_be_sent; i++)
+    /// replace global indices by local indices
+    for (int i = 0; i < total_to_be_sent; i++) {
         elements_to_send[i] -= start_row;
+    }
 
-    ////////////////
     // Finish up !!
-    ////////////////
-    A->nc                 = A->nc + num_external;
-    A->num_send_neighbors = num_send_neighbors;
-    A->local_ncol         = A->local_nrow + num_external;
-
-    // Used in exchange_externals
-    double* send_buffer = new double[total_to_be_sent];
-    A->send_buffer      = send_buffer;
-
-    delete[] tmp_buffer;
-    delete[] global_index_offsets;
-    delete[] recv_list;
-    delete[] external_processor;
-    delete[] new_external;
-    delete[] new_external_processor;
-    delete[] request;
-
-    return;
+    c->num_send_neighbors = num_send_neighbors;
+
+    // Used in exchange
+    CG_FLOAT* send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
+        total_to_be_sent * sizeof(CG_FLOAT));
+    c->send_buffer = send_buffer;
+
+    free(recv_list);
+    free(new_external);
+#endif
+}
+
+void commExchange(Comm* c, Matrix* A, double* x)
+{
+#ifdef _MPI
+    int num_external = 0;
+
+    // Extract Matrix pieces
+    int local_nrow    = A->nr;
+    int num_neighbors = c->num_send_neighbors;
+    int* recv_length  = c->recv_length;
+    int* send_length  = c->send_length;
+    int* neighbors    = c->neighbors;
+    double* send_buffer   = c->send_buffer;
+    int total_to_be_sent  = c->total_to_be_sent;
+    int* elements_to_send = c->elements_to_send;
+
+    int rank      = c->rank;
+    int size      = c->size;
+    MPI_Comm comm = c->comm;
+
+    // first post receives, these are immediate receives
+    // Do not wait for result to come, will do that at the
+    // wait call below.
+    int MPI_MY_TAG = 99;
+    MPI_Request request[num_neighbors];
+
+    // Externals are at end of locals
+    double* x_external = (double*)x + local_nrow;
+
+    // Post receives first
+    for (int i = 0; i < num_neighbors; i++) {
+        int n_recv = recv_length[i];
+        MPI_Irecv(x_external,
+            n_recv,
+            MPI_DOUBLE,
+            neighbors[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+        x_external += n_recv;
+    }
+
+    // Fill up send buffer
+    for (int i = 0; i < total_to_be_sent; i++) {
+        send_buffer[i] = x[elements_to_send[i]];
+    }
+
+    // Send to each neighbor
+    for (int i = 0; i < num_neighbors; i++) {
+        int n_send = send_length[i];
+        MPI_Send(send_buffer,
+            n_send,
+            MPI_DOUBLE,
+            neighbors[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD);
+        send_buffer += n_send;
+    }
+
+    // Complete the reads issued above
+    MPI_Status status;
+
+    for (int i = 0; i < num_neighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            printf("MPI_Wait error\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+#endif
 }
 
 void commPrintConfig(Comm* c)
@@ -568,20 +569,6 @@ void commPrintConfig(Comm* c)
 
     for (int i = 0; i < c->size; i++) {
         if (i == c->rank) {
-            printf("\tRank %d of %d\n", c->rank, c->size);
-            printf("\tNeighbours (bottom, top, left, right): %d %d, %d, %d\n",
-                c->neighbours[BOTTOM],
-                c->neighbours[TOP],
-                c->neighbours[LEFT],
-                c->neighbours[RIGHT]);
-            printf("\tIs boundary:\n");
-            printf("\t\tLEFT: %d\n", commIsBoundary(c, LEFT));
-            printf("\t\tRIGHT: %d\n", commIsBoundary(c, RIGHT));
-            printf("\t\tBOTTOM: %d\n", commIsBoundary(c, BOTTOM));
-            printf("\t\tTOP: %d\n", commIsBoundary(c, TOP));
-            printf("\tCoordinates (i,j) %d %d\n", c->coords[IDIM], c->coords[JDIM]);
-            printf("\tDims (i,j) %d %d\n", c->dims[IDIM], c->dims[JDIM]);
-            printf("\tLocal domain size (i,j) %dx%d\n", c->imaxLocal, c->jmaxLocal);
             fflush(stdout);
         }
         MPI_Barrier(MPI_COMM_WORLD);
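
As an aside on the neighbor bookkeeping in commPartition above: before the MPI_SUM reduction, every process that needs data from rank r adds 1 plus size times its element count into slot r, so rank r can afterwards recover both the number of send neighbors and the total number of elements to send with a modulo and a division. A minimal sketch of just that arithmetic (plain C, no MPI; the counts are made up for illustration):

    #include <stdio.h>

    int main(void)
    {
        // Made-up scenario: 4 ranks, we are rank 2. Ranks 0 and 3 need 5 and 2
        // of our elements, respectively.
        int size = 4;
        int contrib[4] = { 1 + size * 5, 0, 0, 1 + size * 2 };

        // MPI_Allreduce with MPI_SUM would add our slot across all processes;
        // here we just sum the contributions by hand.
        int combined = 0;
        for (int i = 0; i < size; i++) {
            combined += contrib[i];
        }

        // Decode the same way commPartition does
        int num_send_neighbors = combined % size;                        // -> 2
        int total_to_be_sent   = (combined - num_send_neighbors) / size; // -> 7

        printf("send neighbors: %d, elements to send: %d\n",
            num_send_neighbors,
            total_to_be_sent);
        return 0;
    }

The modulo works because the number of ranks contributing a 1 is always smaller than size.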

View File

@@ -10,6 +10,10 @@
 #include "matrix.h"
 
+#define MAX_EXTERNAL       100000
+#define MAX_NUM_MESSAGES   500
+#define MAX_NUM_NEIGHBOURS MAX_NUM_MESSAGES
+
 enum op { MAX = 0, SUM };
 
 typedef struct {
@@ -24,9 +28,9 @@ typedef struct {
     int* external_local_index;
     int total_to_be_sent;
     int* elements_to_send;
-    int* neighbors;
-    int* recv_length;
-    int* send_length;
+    int neighbors[MAX_NUM_NEIGHBOURS];
+    int recv_length[MAX_NUM_NEIGHBOURS];
+    int send_length[MAX_NUM_NEIGHBOURS];
     double* send_buffer;
 #endif
 } Comm;
@@ -36,7 +40,7 @@ extern void commInit(Comm* c, int argc, char** argv);
 extern void commFinalize(Comm* c);
 extern void commPartition(Comm*, Matrix* m);
 extern void commPrintConfig(Comm*);
-extern void commExchange(Comm*, double*);
+extern void commExchange(Comm* c, Matrix* A, double* x);
 extern void commReduction(double* v, int op);
 
 static inline int commIsMaster(Comm* c) { return c->rank == 0; }
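
For context, a rough usage sketch of the new commExchange(Comm*, Matrix*, double*) interface, assuming the usual driver flow of this kind of CG benchmark; matrixRead, spMVM, and the loop structure below are illustrative placeholders, not part of this commit:

    #include "allocate.h"
    #include "comm.h"
    #include "matrix.h"

    // Placeholders for this sketch only (not part of this commit)
    extern void matrixRead(Matrix* A, const char* path);
    extern void spMVM(Matrix* A, double* x, double* y);

    void solverSketch(int argc, char** argv, int maxIter)
    {
        Comm comm;
        Matrix A;

        commInit(&comm, argc, argv);
        matrixRead(&A, "matrix.mtx"); // hypothetical input
        commPartition(&comm, &A);     // builds neighbor lists and send/recv lengths

        // x holds the locally owned rows followed by the received externals;
        // num_external is set by commPartition
        double* x = (double*)allocate(ARRAY_ALIGNMENT,
            (A.nr + comm.num_external) * sizeof(double));
        double* y = (double*)allocate(ARRAY_ALIGNMENT, A.nr * sizeof(double));

        for (int it = 0; it < maxIter; it++) {
            commExchange(&comm, &A, x); // refresh halo entries before the SpMV
            spMVM(&A, x, y);
            // ... remainder of the iteration omitted ...
        }

        commFinalize(&comm);
    }

The exchange has to run before every sparse matrix-vector product so that the halo entries of x, stored after the A.nr locally owned entries, are current.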