Partial port of communication code

parent 512e2903c8
commit 9f37fa73a9

src/comm.c (493 changed lines)
@@ -14,10 +14,6 @@
#include "allocate.h"
#include "comm.h"

#define MAX_EXTERNAL 100000
#define MAX_NUM_MESSAGES 500
#define MAX_NUM_NEIGHBOURS MAX_NUM_MESSAGES

// subroutines local to this module
int sizeOfRank(int rank, int size, int N)
{
@@ -37,6 +33,7 @@ void commReduction(double* v, int op)

void commPartition(Comm* c, Matrix* A)
{
#ifdef _MPI
int rank = c->rank;
int size = c->size;
MPI_Comm comm = c->comm;
@@ -66,15 +63,12 @@ void commPartition(Comm* c, Matrix* A)
// - find out which processor owns the value.
// - Set up communication for sparse MV operation.

///////////////////////////////////////////
// Scan the indices and transform to local
///////////////////////////////////////////
int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
int num_external = 1;

int* external_index = (int*)allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
int num_external = 1;

c->external_index = external_index;
c->external_index = external_index;

for (int i = 0; i < A->totalNr; i++) {
externals[i] = -1;
@@ -95,49 +89,39 @@ void commPartition(Comm* c, Matrix* A)
// shift local rows to the start
if (start_row <= cur_ind && cur_ind <= stop_row) {
col_ind[j] -= start_row;
} else // Must find out if we have already set up this point
{
} else {
// Must find out if we have already set up this point
if (externals[cur_ind] == -1) {
externals[cur_ind] = num_external++;

if (num_external <= MAX_EXTERNAL) {
external_index[num_external - 1] = cur_ind;
// Mark index as external by negating it
ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1);
col_ind[j] = -col_ind[j];
} else {
cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp"
<< endl;
abort();
printf("Must increase MAX_EXTERNAL\n");
exit(EXIT_FAILURE);
}
} else {
// Mark index as external by adding 1 and negating it
ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1);
// Mark index as external by negating it
col_ind[j] = -col_ind[j];
}
}
}
}

////////////////////////////////////////////////////////////////////////////
// Go through list of externals to find out which processors must be accessed.
////////////////////////////////////////////////////////////////////////////
/**************************************************************************
Go through list of externals to find out which processors must be accessed.
**************************************************************************/
c->num_external = num_external;
int tmp_buffer[size];
int global_index_offsets[size];

A->num_external = num_external;
int* tmp_buffer = new int[size]; // Temp buffer space needed below
for (int i = 0; i < size; i++) {
tmp_buffer[i] = 0;
}

// Build list of global index offset

int* global_index_offsets = new int[size];
for (i = 0; i < size; i++)
tmp_buffer[i] = 0; // First zero out

tmp_buffer[rank] = start_row; // This is my start row

// This call sends the start_row of each ith processor to the ith
// entry of global_index_offset on all processors.
// Thus, each processor know the range of indices owned by all
// other processors.
// Note: There might be a better algorithm for doing this, but this
// will work...
tmp_buffer[rank] = start_row;

MPI_Allreduce(tmp_buffer,
global_index_offsets,
@@ -147,105 +131,97 @@ void commPartition(Comm* c, Matrix* A)
MPI_COMM_WORLD);

// Go through list of externals and find the processor that owns each
int* external_processor = new int[num_external];
int* new_external_processor = new int[num_external];
int external_processor[num_external];

for (i = 0; i < num_external; i++) {
for (int i = 0; i < num_external; i++) {
int cur_ind = external_index[i];
for (int j = size - 1; j >= 0; j--)
for (int j = size - 1; j >= 0; j--) {
if (global_index_offsets[j] <= cur_ind) {
external_processor[i] = j;
break;
}
}
if (debug) {
t0 = mytimer() - t0;
cout << " Time in finding processors phase = " << t0 << endl;
}
}

////////////////////////////////////////////////////////////////////////////
// Sift through the external elements. For each newly encountered external
// point assign it the next index in the sequence. Then look for other
// external elements who are update by the same node and assign them the next
// set of index numbers in the sequence (ie. elements updated by the same node
// have consecutive indices).
////////////////////////////////////////////////////////////////////////////

if (debug) t0 = mytimer();
/*Go through the external elements. For each newly encountered external
point assign it the next index in the local sequence. Then look for other
external elements who are updated by the same node and assign them the next
set of index numbers in the local sequence (ie. elements updated by the same node
have consecutive indices).*/
int* external_local_index = (int*)allocate(ARRAY_ALIGNMENT,
MAX_EXTERNAL * sizeof(int));
c->external_local_index = external_local_index;

int count = local_nrow;
for (i = 0; i < num_external; i++)
external_local_index[i] = -1;

for (i = 0; i < num_external; i++) {
for (int i = 0; i < num_external; i++) {
external_local_index[i] = -1;
}

for (int i = 0; i < num_external; i++) {
if (external_local_index[i] == -1) {
external_local_index[i] = count++;

for (j = i + 1; j < num_external; j++) {
if (external_processor[j] == external_processor[i])
for (int j = i + 1; j < num_external; j++) {
if (external_processor[j] == external_processor[i]) {
external_local_index[j] = count++;
}
}
}
}

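A standalone sketch of the grouping pass above, using hypothetical values (local_nrow = 4, five externals, owners picked arbitrarily): externals get the local indices local_nrow, local_nrow + 1, ... in such a way that all externals owned by the same processor receive consecutive indices.

#include <stdio.h>

int main(void)
{
    int local_nrow = 4; /* assumed number of locally owned rows */
    int num_external = 5;
    int external_processor[] = { 2, 0, 2, 1, 0 }; /* hypothetical owners */
    int external_local_index[5];

    for (int i = 0; i < num_external; i++) {
        external_local_index[i] = -1;
    }

    int count = local_nrow;

    for (int i = 0; i < num_external; i++) {
        if (external_local_index[i] == -1) {
            external_local_index[i] = count++;

            /* give every later external with the same owner the next indices */
            for (int j = i + 1; j < num_external; j++) {
                if (external_processor[j] == external_processor[i]) {
                    external_local_index[j] = count++;
                }
            }
        }
    }

    /* Prints 4 6 5 8 7: indices 4,5 go to the externals owned by rank 2,
     * 6,7 to those owned by rank 0, and 8 to the one owned by rank 1. */
    for (int i = 0; i < num_external; i++) {
        printf("%d ", external_local_index[i]);
    }
    printf("\n");
    return 0;
}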
if (debug) {
t0 = mytimer() - t0;
cout << " Time in scanning external indices phase = " << t0 << endl;
}
if (debug) t0 = mytimer();
// map all external ids to the new local index
CG_UINT* rowPtr = A->rowPtr;

for (i = 0; i < local_nrow; i++) {
for (j = 0; j < nnz_in_row[i]; j++) {
if (ptr_to_inds_in_row[i][j] < 0) // Change index values of externals
{
int cur_ind = -ptr_to_inds_in_row[i][j] - 1;
ptr_to_inds_in_row[i][j] = external_local_index[externals[cur_ind]];
for (int i = 0; i < local_nrow; i++) {
for (int j = rowPtr[i]; j < rowPtr[i + 1]; j++) {
if (col_ind[j] < 0) {
int cur_ind = -col_ind[j] - 1; // FIXME: Offset by 1??
col_ind[j] = external_local_index[externals[cur_ind]];
}
}
}

for (i = 0; i < num_external; i++)
int new_external_processor[num_external];

for (int i = 0; i < num_external; i++) {
new_external_processor[i] = 0;
}

for (i = 0; i < num_external; i++)
// setup map from external id to partition
for (int i = 0; i < num_external; i++) {
new_external_processor[external_local_index[i] - local_nrow] =
external_processor[i];

if (debug) {
t0 = mytimer() - t0;
cout << " Time in assigning external indices phase = " << t0 << endl;
}

if (debug_details) {
for (i = 0; i < num_external; i++) {
cout << "Processor " << rank << " of " << size << ": external processor[" << i
<< "] = " << external_processor[i] << endl;
cout << "Processor " << rank << " of " << size << ": new external processor["
<< i << "] = " << new_external_processor[i] << endl;
}
#ifdef VERBOSE
for (int i = 0; i < num_external; i++) {
printf("Process %d of %d: external process[%d] = %d\n",
rank,
size,
i,
external_processor[i]);
}
#endif

////////////////////////////////////////////////////////////////////////////
///
// Count the number of neighbors from which we receive information to update
// our external elements. Additionally, fill the array tmp_neighbors in the
// following way:
// tmp_neighbors[i] = 0 ==> No external elements are updated by
// processor i.
// tmp_neighbors[i] = x ==> (x-1)/size elements are updated from
// processor i.
///
////////////////////////////////////////////////////////////////////////////
/* Count the number of neighbors from which we receive information to update
our external elements. Additionally, fill the array tmp_neighbors in the
following way:
tmp_neighbors[i] = 0 ==> No external elements are updated by
processor i.
tmp_neighbors[i] = x ==> (x-1)/size elements are updated from
processor i.*/

t0 = mytimer();
int* tmp_neighbors = new int[size];
for (i = 0; i < size; i++)
int tmp_neighbors[size];

for (int i = 0; i < size; i++) {
tmp_neighbors[i] = 0;
}

int num_recv_neighbors = 0;
int length = 1;

for (i = 0; i < num_external; i++) {
for (int i = 0; i < num_external; i++) {
if (tmp_neighbors[new_external_processor[i]] == 0) {
num_recv_neighbors++;
tmp_neighbors[new_external_processor[i]] = 1;
@@ -253,90 +229,72 @@ void commPartition(Comm* c, Matrix* A)
tmp_neighbors[new_external_processor[i]] += size;
}

/// sum over all processors all the tmp_neighbors arrays ///

// sum over all processors all the tmp_neighbors arrays
MPI_Allreduce(tmp_neighbors, tmp_buffer, size, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

/// decode the combined 'tmp_neighbors' (stored in tmp_buffer)
// array from all the processors

/* decode the combined 'tmp_neighbors' (stored in tmp_buffer) array from all the
 * processors */
int num_send_neighbors = tmp_buffer[rank] % size;

/// decode 'tmp_buffer[rank] to deduce total number of elements
// we must send

/* decode 'tmp_buffer[rank] to deduce total number of elements we must send */
int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;

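The "% size" and divide decode above works because every rank contributes 1 + size * count to tmp_neighbors[p] for each owner p it needs data from, so after the summing allreduce the low "digit" counts requesting ranks and the rest counts elements. A standalone worked example with three hypothetical ranks (the allreduce is simulated by an element-wise sum):

#include <stdio.h>

int main(void)
{
    /* size = 3 ranks. Each rank r fills tmp[p] = 1 + size * n_rp, where n_rp
     * is the number of externals rank r needs from rank p (0 if none).
     * Hypothetical counts: rank 1 needs 4 elements from rank 0,
     * rank 2 needs 2 elements from rank 0, nobody needs anything else. */
    int size = 3;
    int tmp_rank0[3] = { 0, 0, 0 };
    int tmp_rank1[3] = { 1 + 3 * 4, 0, 0 };
    int tmp_rank2[3] = { 1 + 3 * 2, 0, 0 };

    /* MPI_Allreduce with MPI_SUM produces the element-wise sum. */
    int tmp_buffer[3];
    for (int p = 0; p < size; p++) {
        tmp_buffer[p] = tmp_rank0[p] + tmp_rank1[p] + tmp_rank2[p];
    }

    /* Decode on rank 0: two neighbors want data, 4 + 2 = 6 elements total. */
    int rank = 0;
    int num_send_neighbors = tmp_buffer[rank] % size;
    int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;

    printf("send neighbors = %d, elements to send = %d\n",
        num_send_neighbors,
        total_to_be_sent);
    return 0;
}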
//
// Check to see if we have enough workspace allocated. This could be
// dynamically modified, but let's keep it simple for now...
//

/* Check to see if we have enough workspace allocated. This could be
dynamically modified, but let's keep it simple for now...*/
if (num_send_neighbors > MAX_NUM_MESSAGES) {
cerr << "Must increase MAX_NUM_MESSAGES in HPC_Sparse_Matrix.hpp" << endl;
cerr << "Must be at least " << num_send_neighbors << endl;
abort();
printf("Must increase MAX_NUM_MESSAGES. Must be at least %d\n",
num_send_neighbors);
exit(EXIT_FAILURE);
}

if (total_to_be_sent > MAX_EXTERNAL) {
cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl;
cerr << "Must be at least " << total_to_be_sent << endl;
abort();
printf("Must increase MAX_EXTERNAL. Must be at least %d\n", total_to_be_sent);
exit(EXIT_FAILURE);
}
delete[] tmp_neighbors;

if (debug) {
t0 = mytimer() - t0;
cout << " Time in finding neighbors phase = " << t0 << endl;
}
if (debug)
cout << "Processor " << rank << " of " << size
<< ": Number of send neighbors = " << num_send_neighbors << endl;
#ifdef VERBOSE
cout << "Processor " << rank << " of " << size
<< ": Number of send neighbors = " << num_send_neighbors << endl;

if (debug)
cout << "Processor " << rank << " of " << size
<< ": Number of receive neighbors = " << num_recv_neighbors << endl;
cout << "Processor " << rank << " of " << size
<< ": Number of receive neighbors = " << num_recv_neighbors << endl;

if (debug)
cout << "Processor " << rank << " of " << size
<< ": Total number of elements to send = " << total_to_be_sent << endl;
cout << "Processor " << rank << " of " << size
<< ": Total number of elements to send = " << total_to_be_sent << endl;

if (debug) MPI_Barrier(MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
#endif

/////////////////////////////////////////////////////////////////////////
///
// Make a list of the neighbors that will send information to update our
// external elements (in the order that we will receive this information).
///
/////////////////////////////////////////////////////////////////////////
/* Make a list of the neighbors that will send information to update our
external elements (in the order that we will receive this information).*/
int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));

int* recv_list = new int[MAX_EXTERNAL];

j = 0;
// FIXME: Create local scope
int j = 0;
recv_list[j++] = new_external_processor[0];
for (i = 1; i < num_external; i++) {

for (int i = 1; i < num_external; i++) {
if (new_external_processor[i - 1] != new_external_processor[i]) {
recv_list[j++] = new_external_processor[i];
}
}

//
// Ensure that all the neighbors we expect to receive from also send to us
// Send a 0 length message to each of our recv neighbors
//
int send_list[num_send_neighbors];

int* send_list = new int[num_send_neighbors];
for (i = 0; i < num_send_neighbors; i++)
for (int i = 0; i < num_send_neighbors; i++) {
send_list[i] = 0;
}

//
// first post receives, these are immediate receives
// Do not wait for result to come, will do that at the
// wait call below.
//
int MPI_MY_TAG = 99;

MPI_Request* request = new MPI_Request[MAX_NUM_MESSAGES];
for (i = 0; i < num_send_neighbors; i++) {
MPI_Request request[MAX_NUM_MESSAGES];

for (int i = 0; i < num_send_neighbors; i++) {
MPI_Irecv(tmp_buffer + i,
1,
MPI_INT,
@@ -347,88 +305,77 @@ void commPartition(Comm* c, Matrix* A)
}

// send messages

for (i = 0; i < num_recv_neighbors; i++)
for (int i = 0; i < num_recv_neighbors; i++) {
MPI_Send(tmp_buffer + i, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
///
// Receive message from each send neighbor to construct 'send_list'.
///
}

// Receive message from each send neighbor to construct 'send_list'.
MPI_Status status;
for (i = 0; i < num_send_neighbors; i++) {
for (int i = 0; i < num_send_neighbors; i++) {
if (MPI_Wait(request + i, &status)) {
cerr << "MPI_Wait error\n" << endl;
exit(-1);
printf("MPI_Wait error\n");
exit(EXIT_FAILURE);
}
send_list[i] = status.MPI_SOURCE;
}

/////////////////////////////////////////////////////////////////////////
///
// Compare the two lists. In most cases they should be the same.
/* Compare the two lists. In most cases they should be the same.
// However, if they are not then add new entries to the recv list
// that are in the send list (but not already in the recv list).
///
/////////////////////////////////////////////////////////////////////////

for (j = 0; j < num_send_neighbors; j++) {
WHY!! This ensures that the sendlist is equal to the sendlist
But why is this required? -> Just One neighbour list??*/
for (int j = 0; j < num_send_neighbors; j++) {
int found = 0;
for (i = 0; i < num_recv_neighbors; i++) {
for (int i = 0; i < num_recv_neighbors; i++) {
if (recv_list[i] == send_list[j]) found = 1;
}

if (found == 0) {
if (debug)
cout << "Processor " << rank << " of " << size << ": recv_list["
<< num_recv_neighbors << "] = " << send_list[j] << endl;
#ifdef VERBOSE
printf("Process %d of %d: recv_list[%d] = %d\n",
rank,
size,
num_recv_neighbors,
send_list[i]);
#endif
recv_list[num_recv_neighbors] = send_list[j];
(num_recv_neighbors)++;
}
}

delete[] send_list;
num_send_neighbors = num_recv_neighbors;

if (num_send_neighbors > MAX_NUM_MESSAGES) {
cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl;
abort();
printf("Must increase MAX_EXTERNAL\n");
exit(EXIT_FAILURE);
}

/////////////////////////////////////////////////////////////////////////
/// Start filling HPC_Sparse_Matrix struct
/////////////////////////////////////////////////////////////////////////

A->total_to_be_sent = total_to_be_sent;
int* elements_to_send = new int[total_to_be_sent];
A->elements_to_send = elements_to_send;

for (i = 0; i < total_to_be_sent; i++)
elements_to_send[i] = 0;

//
// Start filling communication setup
// Create 'new_external' which explicitly put the external elements in the
// order given by 'external_local_index'
//
c->total_to_be_sent = total_to_be_sent;
int* elements_to_send = (int*)allocate(ARRAY_ALIGNMENT,
total_to_be_sent * sizeof(int));
c->elements_to_send = elements_to_send;

int* new_external = new int[num_external];
for (i = 0; i < num_external; i++) {
for (int i = 0; i < total_to_be_sent; i++) {
elements_to_send[i] = 0;
}

// Create 'new_external' which explicitly put the external elements in the
// order given by 'external_local_index'
int* new_external = (int*)allocate(ARRAY_ALIGNMENT, num_external * sizeof(int));

for (int i = 0; i < num_external; i++) {
new_external[external_local_index[i] - local_nrow] = external_index[i];
}

/////////////////////////////////////////////////////////////////////////
//
// Send each processor the global index list of the external elements in the
// order that I will want to receive them when updating my external elements
//
/////////////////////////////////////////////////////////////////////////

int* lengths = new int[num_recv_neighbors];

int lengths[num_recv_neighbors];
MPI_MY_TAG++;

// First post receives

for (i = 0; i < num_recv_neighbors; i++) {
for (int i = 0; i < num_recv_neighbors; i++) {
int partner = recv_list[i];
MPI_Irecv(lengths + i,
1,
@@ -439,22 +386,18 @@ void commPartition(Comm* c, Matrix* A)
request + i);
}

int* neighbors = new int[MAX_NUM_NEIGHBOURS];
int* recv_length = new int[MAX_NUM_NEIGHBOURS];
int* send_length = new int[MAX_NUM_NEIGHBOURS];

A->neighbors = neighbors;
A->recv_length = recv_length;
A->send_length = send_length;
int* neighbors = c->neighbors;
int* recv_length = c->recv_length;
int* send_length = c->send_length;

j = 0;
for (i = 0; i < num_recv_neighbors; i++) {

for (int i = 0; i < num_recv_neighbors; i++) {
int start = j;
int newlength = 0;

// go through list of external elements until updating
// processor changes

while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
newlength++;
j++;
@@ -469,25 +412,21 @@ void commPartition(Comm* c, Matrix* A)
}

// Complete the receives of the number of externals

for (i = 0; i < num_recv_neighbors; i++) {
for (int i = 0; i < num_recv_neighbors; i++) {
if (MPI_Wait(request + i, &status)) {
cerr << "MPI_Wait error\n" << endl;
exit(-1);
printf("MPI_Wait error\n");
exit(EXIT_FAILURE);
}
send_length[i] = lengths[i];
}
delete[] lengths;

///////////////////////////////////////////////////////////////////
// Build "elements_to_send" list. These are the x elements I own
// that need to be sent to other processors.
///////////////////////////////////////////////////////////////////

MPI_MY_TAG++;

j = 0;
for (i = 0; i < num_recv_neighbors; i++) {

for (int i = 0; i < num_recv_neighbors; i++) {
MPI_Irecv(elements_to_send + j,
send_length[i],
MPI_INT,
@@ -499,14 +438,14 @@ void commPartition(Comm* c, Matrix* A)
}

j = 0;
for (i = 0; i < num_recv_neighbors; i++) {

for (int i = 0; i < num_recv_neighbors; i++) {
int start = j;
int newlength = 0;

// Go through list of external elements
// until updating processor changes. This is redundant, but
// saves us from recording this information.

while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {

newlength++;
@@ -522,39 +461,101 @@ void commPartition(Comm* c, Matrix* A)
}

// receive from each neighbor the global index list of external elements

for (i = 0; i < num_recv_neighbors; i++) {
for (int i = 0; i < num_recv_neighbors; i++) {
if (MPI_Wait(request + i, &status)) {
cerr << "MPI_Wait error\n" << endl;
exit(-1);
printf("MPI_Wait error\n");
exit(EXIT_FAILURE);
}
}

/// replace global indices by local indices ///

for (i = 0; i < total_to_be_sent; i++)
/// replace global indices by local indices
for (int i = 0; i < total_to_be_sent; i++) {
elements_to_send[i] -= start_row;
}

////////////////
// Finish up !!
////////////////
c->num_send_neighbors = num_send_neighbors;
A->nc = A->nc + num_external;

A->num_send_neighbors = num_send_neighbors;
A->local_ncol = A->local_nrow + num_external;
// Used in exchange
CG_FLOAT* send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
total_to_be_sent * sizeof(CG_FLOAT));
c->send_buffer = send_buffer;

// Used in exchange_externals
double* send_buffer = new double[total_to_be_sent];
A->send_buffer = send_buffer;
free(recv_list);
free(new_external);
#endif
}

delete[] tmp_buffer;
delete[] global_index_offsets;
delete[] recv_list;
delete[] external_processor;
delete[] new_external;
delete[] new_external_processor;
delete[] request;
void commExchange(Comm* c, Matrix* A, double* x)
{
#ifdef _MPI
int num_external = 0;

return;
// Extract Matrix pieces

int local_nrow = A->nr;
int num_neighbors = c->num_send_neighbors;
int* recv_length = c->recv_length;
int* send_length = c->send_length;
int* neighbors = c->neighbors;
double* send_buffer = c->send_buffer;
int total_to_be_sent = c->total_to_be_sent;
int* elements_to_send = c->elements_to_send;

int rank = c->rank;
int size = c->size;
MPI_Comm comm = c->comm;

// first post receives, these are immediate receives
// Do not wait for result to come, will do that at the
// wait call below.
int MPI_MY_TAG = 99;

MPI_Request request[num_neighbors];

// Externals are at end of locals
double* x_external = (double*)x + local_nrow;

// Post receives first
for (int i = 0; i < num_neighbors; i++) {
int n_recv = recv_length[i];
MPI_Irecv(x_external,
n_recv,
MPI_DOUBLE,
neighbors[i],
MPI_MY_TAG,
MPI_COMM_WORLD,
request + i);
x_external += n_recv;
}

// Fill up send buffer
for (int i = 0; i < total_to_be_sent; i++) {
send_buffer[i] = x[elements_to_send[i]];
}

// Send to each neighbor
for (int i = 0; i < num_neighbors; i++) {
int n_send = send_length[i];
MPI_Send(send_buffer,
n_send,
MPI_DOUBLE,
neighbors[i],
MPI_MY_TAG,
MPI_COMM_WORLD);
send_buffer += n_send;
}

// Complete the reads issued above
MPI_Status status;
for (int i = 0; i < num_neighbors; i++) {
if (MPI_Wait(request + i, &status)) {
printf("MPI_Wait error\n");
exit(EXIT_FAILURE);
}
}
#endif
}

void commPrintConfig(Comm* c)
@@ -568,20 +569,6 @@ void commPrintConfig(Comm* c)

for (int i = 0; i < c->size; i++) {
if (i == c->rank) {
printf("\tRank %d of %d\n", c->rank, c->size);
printf("\tNeighbours (bottom, top, left, right): %d %d, %d, %d\n",
c->neighbours[BOTTOM],
c->neighbours[TOP],
c->neighbours[LEFT],
c->neighbours[RIGHT]);
printf("\tIs boundary:\n");
printf("\t\tLEFT: %d\n", commIsBoundary(c, LEFT));
printf("\t\tRIGHT: %d\n", commIsBoundary(c, RIGHT));
printf("\t\tBOTTOM: %d\n", commIsBoundary(c, BOTTOM));
printf("\t\tTOP: %d\n", commIsBoundary(c, TOP));
printf("\tCoordinates (i,j) %d %d\n", c->coords[IDIM], c->coords[JDIM]);
printf("\tDims (i,j) %d %d\n", c->dims[IDIM], c->dims[JDIM]);
printf("\tLocal domain size (i,j) %dx%d\n", c->imaxLocal, c->jmaxLocal);
fflush(stdout);
}
MPI_Barrier(MPI_COMM_WORLD);
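
The exchange pattern used by commExchange above (post receives for the externals at the tail of x, pack the owned entries listed in elements_to_send, send, then wait) can be exercised in isolation. A minimal two-rank sketch with hypothetical sizes and index lists, not tied to the Matrix/Comm structs:

#include <mpi.h>
#include <stdio.h>

/* Two ranks, each owning 3 values; each needs 2 externals from the other.
 * elements_to_send lists which locally owned entries the neighbor wants. */
int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    if (size != 2) {
        if (rank == 0) printf("Run with exactly 2 ranks\n");
        MPI_Finalize();
        return 0;
    }

    int local_nrow = 3;
    double x[5]; /* 3 owned values followed by 2 externals */
    for (int i = 0; i < local_nrow; i++) {
        x[i] = 10.0 * rank + i; /* rank 0 owns 0,1,2; rank 1 owns 10,11,12 */
    }

    int neighbor = 1 - rank;
    int elements_to_send[2] = { 0, 2 }; /* send my first and last owned entry */
    double send_buffer[2];
    int tag = 99;

    /* Post the receive for the externals first, then pack and send. */
    MPI_Request request;
    MPI_Irecv(x + local_nrow, 2, MPI_DOUBLE, neighbor, tag, MPI_COMM_WORLD, &request);

    for (int i = 0; i < 2; i++) {
        send_buffer[i] = x[elements_to_send[i]];
    }
    MPI_Send(send_buffer, 2, MPI_DOUBLE, neighbor, tag, MPI_COMM_WORLD);

    MPI_Status status;
    MPI_Wait(&request, &status);

    /* rank 0 ends up with externals 10.0 12.0, rank 1 with 0.0 2.0 */
    printf("rank %d externals: %.1f %.1f\n", rank, x[3], x[4]);

    MPI_Finalize();
    return 0;
}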

src/comm.h (12 changed lines)
@@ -10,6 +10,10 @@

#include "matrix.h"

#define MAX_EXTERNAL 100000
#define MAX_NUM_MESSAGES 500
#define MAX_NUM_NEIGHBOURS MAX_NUM_MESSAGES

enum op { MAX = 0, SUM };

typedef struct {
@@ -24,9 +28,9 @@ typedef struct {
int* external_local_index;
int total_to_be_sent;
int* elements_to_send;
int* neighbors;
int* recv_length;
int* send_length;
int neighbors[MAX_NUM_NEIGHBOURS];
int recv_length[MAX_NUM_NEIGHBOURS];
int send_length[MAX_NUM_NEIGHBOURS];
double* send_buffer;
#endif
} Comm;
@@ -36,7 +40,7 @@ extern void commInit(Comm* c, int argc, char** argv);
extern void commFinalize(Comm* c);
extern void commPartition(Comm*, Matrix* m);
extern void commPrintConfig(Comm*);
extern void commExchange(Comm*, double*);
extern void commExchange(Comm* c, Matrix* A, double* x);
extern void commReduction(double* v, int op);

static inline int commIsMaster(Comm* c) { return c->rank == 0; }