Partial port of communication code
parent 512e2903c8, commit 9f37fa73a9

src/comm.c | 493 lines changed
@@ -14,10 +14,6 @@
 #include "allocate.h"
 #include "comm.h"

-#define MAX_EXTERNAL 100000
-#define MAX_NUM_MESSAGES 500
-#define MAX_NUM_NEIGHBOURS MAX_NUM_MESSAGES
-
 // subroutines local to this module
 int sizeOfRank(int rank, int size, int N)
 {
@@ -37,6 +33,7 @@ void commReduction(double* v, int op)

 void commPartition(Comm* c, Matrix* A)
 {
+#ifdef _MPI
 int rank = c->rank;
 int size = c->size;
 MPI_Comm comm = c->comm;
@@ -66,15 +63,12 @@ void commPartition(Comm* c, Matrix* A)
 // - find out which processor owns the value.
 // - Set up communication for sparse MV operation.

-///////////////////////////////////////////
 // Scan the indices and transform to local
-///////////////////////////////////////////
+int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
+int num_external = 1;

 int* external_index = (int*)allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
-int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
-int num_external = 1;
-
-c->external_index = external_index;
+c->external_index = external_index;

 for (int i = 0; i < A->totalNr; i++) {
 externals[i] = -1;
@@ -95,49 +89,39 @@ void commPartition(Comm* c, Matrix* A)
 // shift local rows to the start
 if (start_row <= cur_ind && cur_ind <= stop_row) {
 col_ind[j] -= start_row;
-} else // Must find out if we have already set up this point
-{
+} else {
+// Must find out if we have already set up this point
 if (externals[cur_ind] == -1) {
 externals[cur_ind] = num_external++;

 if (num_external <= MAX_EXTERNAL) {
 external_index[num_external - 1] = cur_ind;
 // Mark index as external by negating it
-ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1);
+col_ind[j] = -col_ind[j];
 } else {
-cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp"
-<< endl;
-abort();
+printf("Must increase MAX_EXTERNAL\n");
+exit(EXIT_FAILURE);
 }
 } else {
-// Mark index as external by adding 1 and negating it
-ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1);
+// Mark index as external by negating it
+col_ind[j] = -col_ind[j];
 }
 }
 }
 }

-////////////////////////////////////////////////////////////////////////////
-// Go through list of externals to find out which processors must be accessed.
-////////////////////////////////////////////////////////////////////////////
+/**************************************************************************
+Go through list of externals to find out which processors must be accessed.
+**************************************************************************/
+c->num_external = num_external;
+int tmp_buffer[size];
+int global_index_offsets[size];

-A->num_external = num_external;
-int* tmp_buffer = new int[size]; // Temp buffer space needed below
+for (int i = 0; i < size; i++) {
+tmp_buffer[i] = 0;
+}

-// Build list of global index offset
+tmp_buffer[rank] = start_row;

-int* global_index_offsets = new int[size];
-for (i = 0; i < size; i++)
-tmp_buffer[i] = 0; // First zero out
-
-tmp_buffer[rank] = start_row; // This is my start row
-
-// This call sends the start_row of each ith processor to the ith
-// entry of global_index_offset on all processors.
-// Thus, each processor know the range of indices owned by all
-// other processors.
-// Note: There might be a better algorithm for doing this, but this
-// will work...
-
 MPI_Allreduce(tmp_buffer,
 global_index_offsets,
@@ -147,105 +131,97 @@ void commPartition(Comm* c, Matrix* A)
 MPI_COMM_WORLD);

 // Go through list of externals and find the processor that owns each
-int* external_processor = new int[num_external];
-int* new_external_processor = new int[num_external];
+int external_processor[num_external];

-for (i = 0; i < num_external; i++) {
+for (int i = 0; i < num_external; i++) {
 int cur_ind = external_index[i];
-for (int j = size - 1; j >= 0; j--)
+for (int j = size - 1; j >= 0; j--) {
 if (global_index_offsets[j] <= cur_ind) {
 external_processor[i] = j;
 break;
 }
 }
-if (debug) {
-t0 = mytimer() - t0;
-cout << " Time in finding processors phase = " << t0 << endl;
 }

-////////////////////////////////////////////////////////////////////////////
-// Sift through the external elements. For each newly encountered external
-// point assign it the next index in the sequence. Then look for other
-// external elements who are update by the same node and assign them the next
-// set of index numbers in the sequence (ie. elements updated by the same node
-// have consecutive indices).
-////////////////////////////////////////////////////////////////////////////
-if (debug) t0 = mytimer();
+/*Go through the external elements. For each newly encountered external
+point assign it the next index in the local sequence. Then look for other
+external elements who are updated by the same node and assign them the next
+set of index numbers in the local sequence (ie. elements updated by the same node
+have consecutive indices).*/
+int* external_local_index = (int*)allocate(ARRAY_ALIGNMENT,
+MAX_EXTERNAL * sizeof(int));
+c->external_local_index = external_local_index;

 int count = local_nrow;
-for (i = 0; i < num_external; i++)
-external_local_index[i] = -1;

-for (i = 0; i < num_external; i++) {
+for (int i = 0; i < num_external; i++) {
+external_local_index[i] = -1;
+}

+for (int i = 0; i < num_external; i++) {
 if (external_local_index[i] == -1) {
 external_local_index[i] = count++;

-for (j = i + 1; j < num_external; j++) {
-if (external_processor[j] == external_processor[i])
+for (int j = i + 1; j < num_external; j++) {
+if (external_processor[j] == external_processor[i]) {
 external_local_index[j] = count++;
+}
 }
 }
 }

-if (debug) {
-t0 = mytimer() - t0;
-cout << " Time in scanning external indices phase = " << t0 << endl;
-}
-if (debug) t0 = mytimer();
+// map all external ids to the new local index
+CG_UINT* rowPtr = A->rowPtr;

-for (i = 0; i < local_nrow; i++) {
-for (j = 0; j < nnz_in_row[i]; j++) {
-if (ptr_to_inds_in_row[i][j] < 0) // Change index values of externals
-{
-int cur_ind = -ptr_to_inds_in_row[i][j] - 1;
-ptr_to_inds_in_row[i][j] = external_local_index[externals[cur_ind]];
+for (int i = 0; i < local_nrow; i++) {
+for (int j = rowPtr[i]; j < rowPtr[i + 1]; j++) {
+if (col_ind[j] < 0) {
+int cur_ind = -col_ind[j] - 1; // FIXME: Offset by 1??
+col_ind[j] = external_local_index[externals[cur_ind]];
 }
 }
 }

-for (i = 0; i < num_external; i++)
+int new_external_processor[num_external];

+for (int i = 0; i < num_external; i++) {
 new_external_processor[i] = 0;
+}

-for (i = 0; i < num_external; i++)
+// setup map from external id to partition
+for (int i = 0; i < num_external; i++) {
 new_external_processor[external_local_index[i] - local_nrow] =
 external_processor[i];

-if (debug) {
-t0 = mytimer() - t0;
-cout << " Time in assigning external indices phase = " << t0 << endl;
 }

-if (debug_details) {
-for (i = 0; i < num_external; i++) {
-cout << "Processor " << rank << " of " << size << ": external processor[" << i
-<< "] = " << external_processor[i] << endl;
-cout << "Processor " << rank << " of " << size << ": new external processor["
-<< i << "] = " << new_external_processor[i] << endl;
-}
+#ifdef VERBOSE
+for (int i = 0; i < num_external; i++) {
+printf("Process %d of %d: external process[%d] = %d\n",
+rank,
+size,
+i,
+external_processor[i]);
 }
+#endif

-////////////////////////////////////////////////////////////////////////////
-///
-// Count the number of neighbors from which we receive information to update
-// our external elements. Additionally, fill the array tmp_neighbors in the
-// following way:
-// tmp_neighbors[i] = 0 ==> No external elements are updated by
-// processor i.
-// tmp_neighbors[i] = x ==> (x-1)/size elements are updated from
-// processor i.
-///
-////////////////////////////////////////////////////////////////////////////
+/* Count the number of neighbors from which we receive information to update
+our external elements. Additionally, fill the array tmp_neighbors in the
+following way:
+tmp_neighbors[i] = 0 ==> No external elements are updated by
+processor i.
+tmp_neighbors[i] = x ==> (x-1)/size elements are updated from
+processor i.*/

-t0 = mytimer();
-int* tmp_neighbors = new int[size];
-for (i = 0; i < size; i++)
+int tmp_neighbors[size];
+for (int i = 0; i < size; i++) {
 tmp_neighbors[i] = 0;
+}

 int num_recv_neighbors = 0;
 int length = 1;

-for (i = 0; i < num_external; i++) {
+for (int i = 0; i < num_external; i++) {
 if (tmp_neighbors[new_external_processor[i]] == 0) {
 num_recv_neighbors++;
 tmp_neighbors[new_external_processor[i]] = 1;
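The ported code above marks an external column index by plain negation (col_ind[j] = -col_ind[j]) but still decodes it as -col_ind[j] - 1, which the FIXME in this hunk questions. A minimal sketch (not part of this commit) of the two encodings shows why the original offset-by-one form is needed once index 0 can be external:

    /* Sketch only: marking an index as "external" by sign. Plain negation
     * cannot mark index 0, because -0 == 0 and the mark is lost. */
    int idx = 0;
    int plain = -idx;          /* still 0: indistinguishable from a local index */
    int offset = -(idx + 1);   /* -1: unambiguously marked */
    int decoded = -offset - 1; /* recovers 0 */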
@@ -253,90 +229,72 @@ void commPartition(Comm* c, Matrix* A)
 tmp_neighbors[new_external_processor[i]] += size;
 }

-/// sum over all processors all the tmp_neighbors arrays ///
+// sum over all processors all the tmp_neighbors arrays

 MPI_Allreduce(tmp_neighbors, tmp_buffer, size, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

-/// decode the combined 'tmp_neighbors' (stored in tmp_buffer)
-// array from all the processors
+/* decode the combined 'tmp_neighbors' (stored in tmp_buffer) array from all the
+* processors */

 int num_send_neighbors = tmp_buffer[rank] % size;

-/// decode 'tmp_buffer[rank] to deduce total number of elements
-// we must send
+/* decode 'tmp_buffer[rank] to deduce total number of elements we must send */

 int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;

-//
-// Check to see if we have enough workspace allocated. This could be
-// dynamically modified, but let's keep it simple for now...
-//
+/* Check to see if we have enough workspace allocated. This could be
+dynamically modified, but let's keep it simple for now...*/

 if (num_send_neighbors > MAX_NUM_MESSAGES) {
-cerr << "Must increase MAX_NUM_MESSAGES in HPC_Sparse_Matrix.hpp" << endl;
-cerr << "Must be at least " << num_send_neighbors << endl;
-abort();
+printf("Must increase MAX_NUM_MESSAGES. Must be at least %d\n",
+num_send_neighbors);
+exit(EXIT_FAILURE);
 }

 if (total_to_be_sent > MAX_EXTERNAL) {
-cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl;
-cerr << "Must be at least " << total_to_be_sent << endl;
-abort();
+printf("Must increase MAX_EXTERNAL. Must be at least %d\n", total_to_be_sent);
+exit(EXIT_FAILURE);
 }
-delete[] tmp_neighbors;

-if (debug) {
-t0 = mytimer() - t0;
-cout << " Time in finding neighbors phase = " << t0 << endl;
-}
-if (debug)
-cout << "Processor " << rank << " of " << size
-<< ": Number of send neighbors = " << num_send_neighbors << endl;
+#ifdef VERBOSE
+cout << "Processor " << rank << " of " << size
+<< ": Number of send neighbors = " << num_send_neighbors << endl;

-if (debug)
-cout << "Processor " << rank << " of " << size
-<< ": Number of receive neighbors = " << num_recv_neighbors << endl;
+cout << "Processor " << rank << " of " << size
+<< ": Number of receive neighbors = " << num_recv_neighbors << endl;

-if (debug)
-cout << "Processor " << rank << " of " << size
-<< ": Total number of elements to send = " << total_to_be_sent << endl;
+cout << "Processor " << rank << " of " << size
+<< ": Total number of elements to send = " << total_to_be_sent << endl;

-if (debug) MPI_Barrier(MPI_COMM_WORLD);
+MPI_Barrier(MPI_COMM_WORLD);
+#endif

-/////////////////////////////////////////////////////////////////////////
-///
-// Make a list of the neighbors that will send information to update our
-// external elements (in the order that we will receive this information).
-///
-/////////////////////////////////////////////////////////////////////////
+/* Make a list of the neighbors that will send information to update our
+external elements (in the order that we will receive this information).*/
+int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));

-int* recv_list = new int[MAX_EXTERNAL];
-j = 0;
+// FIXME: Create local scope
+int j = 0;
 recv_list[j++] = new_external_processor[0];
-for (i = 1; i < num_external; i++) {
+for (int i = 1; i < num_external; i++) {
 if (new_external_processor[i - 1] != new_external_processor[i]) {
 recv_list[j++] = new_external_processor[i];
 }
 }

-//
+// Ensure that all the neighbors we expect to receive from also send to us
 // Send a 0 length message to each of our recv neighbors
-//
+int send_list[num_send_neighbors];

-int* send_list = new int[num_send_neighbors];
-for (i = 0; i < num_send_neighbors; i++)
+for (int i = 0; i < num_send_neighbors; i++) {
 send_list[i] = 0;
+}

-//
 // first post receives, these are immediate receives
 // Do not wait for result to come, will do that at the
 // wait call below.
-//
 int MPI_MY_TAG = 99;

-MPI_Request* request = new MPI_Request[MAX_NUM_MESSAGES];
-for (i = 0; i < num_send_neighbors; i++) {
+MPI_Request request[MAX_NUM_MESSAGES];
+for (int i = 0; i < num_send_neighbors; i++) {
 MPI_Irecv(tmp_buffer + i,
 1,
 MPI_INT,
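The counting loop above adds 1 on first contact with a partition and size per external element, so after the MPI_Allreduce each rank can split tmp_buffer[rank] into a neighbour count and an element count, exactly as the decode lines in this hunk do. A small worked example with hypothetical numbers (not part of the commit):

    /* Sketch only: decoding the combined counter. Two hypothetical ranks need
     * data from this rank, one for 3 elements and one for 5, with size = 4. */
    int size = 4;
    int combined = (1 + 3 * size) + (1 + 5 * size);                /* 34 */
    int num_send_neighbors = combined % size;                      /* 2  */
    int total_to_be_sent = (combined - num_send_neighbors) / size; /* 8  */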
@@ -347,88 +305,77 @@ void commPartition(Comm* c, Matrix* A)
 }

 // send messages
-for (i = 0; i < num_recv_neighbors; i++)
+for (int i = 0; i < num_recv_neighbors; i++) {
 MPI_Send(tmp_buffer + i, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
-///
-// Receive message from each send neighbor to construct 'send_list'.
-///
+}

+// Receive message from each send neighbor to construct 'send_list'.
 MPI_Status status;
-for (i = 0; i < num_send_neighbors; i++) {
+for (int i = 0; i < num_send_neighbors; i++) {
 if (MPI_Wait(request + i, &status)) {
-cerr << "MPI_Wait error\n" << endl;
-exit(-1);
+printf("MPI_Wait error\n");
+exit(EXIT_FAILURE);
 }
 send_list[i] = status.MPI_SOURCE;
 }

-/////////////////////////////////////////////////////////////////////////
-///
-// Compare the two lists. In most cases they should be the same.
+/* Compare the two lists. In most cases they should be the same.
 // However, if they are not then add new entries to the recv list
 // that are in the send list (but not already in the recv list).
-///
-/////////////////////////////////////////////////////////////////////////
-for (j = 0; j < num_send_neighbors; j++) {
+WHY!! This ensures that the sendlist is equal to the sendlist
+But why is this required? -> Just One neighbour list??*/
+for (int j = 0; j < num_send_neighbors; j++) {
 int found = 0;
-for (i = 0; i < num_recv_neighbors; i++) {
+for (int i = 0; i < num_recv_neighbors; i++) {
 if (recv_list[i] == send_list[j]) found = 1;
 }

 if (found == 0) {
-if (debug)
-cout << "Processor " << rank << " of " << size << ": recv_list["
-<< num_recv_neighbors << "] = " << send_list[j] << endl;
+#ifdef VERBOSE
+printf("Process %d of %d: recv_list[%d] = %d\n",
+rank,
+size,
+num_recv_neighbors,
+send_list[i]);
+#endif
 recv_list[num_recv_neighbors] = send_list[j];
 (num_recv_neighbors)++;
 }
 }

-delete[] send_list;
 num_send_neighbors = num_recv_neighbors;

 if (num_send_neighbors > MAX_NUM_MESSAGES) {
-cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl;
-abort();
+printf("Must increase MAX_EXTERNAL\n");
+exit(EXIT_FAILURE);
 }

-/////////////////////////////////////////////////////////////////////////
-/// Start filling HPC_Sparse_Matrix struct
-/////////////////////////////////////////////////////////////////////////
+// Start filling communication setup

-A->total_to_be_sent = total_to_be_sent;
-int* elements_to_send = new int[total_to_be_sent];
-A->elements_to_send = elements_to_send;

-for (i = 0; i < total_to_be_sent; i++)
-elements_to_send[i] = 0;

-//
 // Create 'new_external' which explicitly put the external elements in the
 // order given by 'external_local_index'
-//
+c->total_to_be_sent = total_to_be_sent;
+int* elements_to_send = (int*)allocate(ARRAY_ALIGNMENT,
+total_to_be_sent * sizeof(int));
+c->elements_to_send = elements_to_send;

-int* new_external = new int[num_external];
-for (i = 0; i < num_external; i++) {
+for (int i = 0; i < total_to_be_sent; i++) {
+elements_to_send[i] = 0;
+}

+// Create 'new_external' which explicitly put the external elements in the
+// order given by 'external_local_index'
+int* new_external = (int*)allocate(ARRAY_ALIGNMENT, num_external * sizeof(int));

+for (int i = 0; i < num_external; i++) {
 new_external[external_local_index[i] - local_nrow] = external_index[i];
 }

-/////////////////////////////////////////////////////////////////////////
-//
 // Send each processor the global index list of the external elements in the
 // order that I will want to receive them when updating my external elements
-//
-/////////////////////////////////////////////////////////////////////////
+int lengths[num_recv_neighbors];

-int* lengths = new int[num_recv_neighbors];

 MPI_MY_TAG++;

 // First post receives
-for (i = 0; i < num_recv_neighbors; i++) {
+for (int i = 0; i < num_recv_neighbors; i++) {
 int partner = recv_list[i];
 MPI_Irecv(lengths + i,
 1,
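The zero-length message exchange in the hunks above lets every rank learn who will send to it without any global list: the expected number of receives is posted, each rank pings the neighbours it receives from, and MPI_SOURCE of the completed waits fills send_list. A standalone sketch of the pattern, assuming the two counts and recv_list are already known and that the posted receives accept any source:

    /* Sketch only: neighbour discovery via empty messages (assumptions:
     * num_send_neighbors, num_recv_neighbors and recv_list are known;
     * the receives use MPI_ANY_SOURCE). */
    MPI_Request req[MAX_NUM_MESSAGES];
    MPI_Status st;
    int dummy[MAX_NUM_MESSAGES];

    for (int i = 0; i < num_send_neighbors; i++) {
        MPI_Irecv(&dummy[i], 1, MPI_INT, MPI_ANY_SOURCE, 99, MPI_COMM_WORLD, &req[i]);
    }
    for (int i = 0; i < num_recv_neighbors; i++) {
        MPI_Send(&dummy[i], 1, MPI_INT, recv_list[i], 99, MPI_COMM_WORLD);
    }
    for (int i = 0; i < num_send_neighbors; i++) {
        MPI_Wait(&req[i], &st);
        send_list[i] = st.MPI_SOURCE; /* who actually contacted me */
    }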
@@ -439,22 +386,18 @@ void commPartition(Comm* c, Matrix* A)
 request + i);
 }

-int* neighbors = new int[MAX_NUM_NEIGHBOURS];
-int* recv_length = new int[MAX_NUM_NEIGHBOURS];
-int* send_length = new int[MAX_NUM_NEIGHBOURS];
+int* neighbors = c->neighbors;
+int* recv_length = c->recv_length;
+int* send_length = c->send_length;

-A->neighbors = neighbors;
-A->recv_length = recv_length;
-A->send_length = send_length;

 j = 0;
-for (i = 0; i < num_recv_neighbors; i++) {
+for (int i = 0; i < num_recv_neighbors; i++) {
 int start = j;
 int newlength = 0;

 // go through list of external elements until updating
 // processor changes

 while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
 newlength++;
 j++;
@@ -469,25 +412,21 @@ void commPartition(Comm* c, Matrix* A)
 }

 // Complete the receives of the number of externals
-for (i = 0; i < num_recv_neighbors; i++) {
+for (int i = 0; i < num_recv_neighbors; i++) {
 if (MPI_Wait(request + i, &status)) {
-cerr << "MPI_Wait error\n" << endl;
-exit(-1);
+printf("MPI_Wait error\n");
+exit(EXIT_FAILURE);
 }
 send_length[i] = lengths[i];
 }
-delete[] lengths;

-///////////////////////////////////////////////////////////////////
 // Build "elements_to_send" list. These are the x elements I own
 // that need to be sent to other processors.
-///////////////////////////////////////////////////////////////////

 MPI_MY_TAG++;

 j = 0;
-for (i = 0; i < num_recv_neighbors; i++) {
+for (int i = 0; i < num_recv_neighbors; i++) {
 MPI_Irecv(elements_to_send + j,
 send_length[i],
 MPI_INT,
@@ -499,14 +438,14 @@ void commPartition(Comm* c, Matrix* A)
 }

 j = 0;
-for (i = 0; i < num_recv_neighbors; i++) {
+for (int i = 0; i < num_recv_neighbors; i++) {
 int start = j;
 int newlength = 0;

 // Go through list of external elements
 // until updating processor changes. This is redundant, but
 // saves us from recording this information.

 while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {

 newlength++;
@@ -522,39 +461,101 @@ void commPartition(Comm* c, Matrix* A)
 }

 // receive from each neighbor the global index list of external elements
-for (i = 0; i < num_recv_neighbors; i++) {
+for (int i = 0; i < num_recv_neighbors; i++) {
 if (MPI_Wait(request + i, &status)) {
-cerr << "MPI_Wait error\n" << endl;
-exit(-1);
+printf("MPI_Wait error\n");
+exit(EXIT_FAILURE);
 }
 }

-/// replace global indices by local indices ///
-for (i = 0; i < total_to_be_sent; i++)
+/// replace global indices by local indices
+for (int i = 0; i < total_to_be_sent; i++) {
 elements_to_send[i] -= start_row;
+}

-////////////////
 // Finish up !!
-////////////////
+c->num_send_neighbors = num_send_neighbors;
+A->nc = A->nc + num_external;

-A->num_send_neighbors = num_send_neighbors;
-A->local_ncol = A->local_nrow + num_external;
+// Used in exchange
+CG_FLOAT* send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
+total_to_be_sent * sizeof(CG_FLOAT));
+c->send_buffer = send_buffer;

-// Used in exchange_externals
-double* send_buffer = new double[total_to_be_sent];
-A->send_buffer = send_buffer;
+free(recv_list);
+free(new_external);
+#endif
+}

-delete[] tmp_buffer;
-delete[] global_index_offsets;
-delete[] recv_list;
-delete[] external_processor;
-delete[] new_external;
-delete[] new_external_processor;
-delete[] request;
+void commExchange(Comm* c, Matrix* A, double* x)
+{
+#ifdef _MPI
+int num_external = 0;

-return;
+// Extract Matrix pieces
+int local_nrow = A->nr;
+int num_neighbors = c->num_send_neighbors;
+int* recv_length = c->recv_length;
+int* send_length = c->send_length;
+int* neighbors = c->neighbors;
+double* send_buffer = c->send_buffer;
+int total_to_be_sent = c->total_to_be_sent;
+int* elements_to_send = c->elements_to_send;

+int rank = c->rank;
+int size = c->size;
+MPI_Comm comm = c->comm;

+// first post receives, these are immediate receives
+// Do not wait for result to come, will do that at the
+// wait call below.
+int MPI_MY_TAG = 99;

+MPI_Request request[num_neighbors];

+// Externals are at end of locals
+double* x_external = (double*)x + local_nrow;

+// Post receives first
+for (int i = 0; i < num_neighbors; i++) {
+int n_recv = recv_length[i];
+MPI_Irecv(x_external,
+n_recv,
+MPI_DOUBLE,
+neighbors[i],
+MPI_MY_TAG,
+MPI_COMM_WORLD,
+request + i);
+x_external += n_recv;
+}

+// Fill up send buffer
+for (int i = 0; i < total_to_be_sent; i++) {
+send_buffer[i] = x[elements_to_send[i]];
+}

+// Send to each neighbor
+for (int i = 0; i < num_neighbors; i++) {
+int n_send = send_length[i];
+MPI_Send(send_buffer,
+n_send,
+MPI_DOUBLE,
+neighbors[i],
+MPI_MY_TAG,
+MPI_COMM_WORLD);
+send_buffer += n_send;
+}

+// Complete the reads issued above
+MPI_Status status;
+for (int i = 0; i < num_neighbors; i++) {
+if (MPI_Wait(request + i, &status)) {
+printf("MPI_Wait error\n");
+exit(EXIT_FAILURE);
+}
+}
+#endif
 }

 void commPrintConfig(Comm* c)
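The new commExchange receives directly into the tail of x (the external slots starting at A->nr), packs the owned entries through elements_to_send, and sends one message per neighbour. A hedged usage sketch, where the SpMV call, the result vector y, and the vector sizing are assumptions rather than part of this commit:

    /* Sketch only: x needs room for the local rows plus the received externals;
     * the halo must be refreshed before every sparse matrix-vector product. */
    double* x = (double*)allocate(ARRAY_ALIGNMENT,
            (A.nr + c.num_external) * sizeof(double));

    commExchange(&c, &A, x);  /* fills the c.num_external entries starting at x[A.nr] */
    spmv(&A, x, y);           /* hypothetical SpMV using the local column indices     */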
@@ -568,20 +569,6 @@ void commPrintConfig(Comm* c)

 for (int i = 0; i < c->size; i++) {
 if (i == c->rank) {
-printf("\tRank %d of %d\n", c->rank, c->size);
-printf("\tNeighbours (bottom, top, left, right): %d %d, %d, %d\n",
-c->neighbours[BOTTOM],
-c->neighbours[TOP],
-c->neighbours[LEFT],
-c->neighbours[RIGHT]);
-printf("\tIs boundary:\n");
-printf("\t\tLEFT: %d\n", commIsBoundary(c, LEFT));
-printf("\t\tRIGHT: %d\n", commIsBoundary(c, RIGHT));
-printf("\t\tBOTTOM: %d\n", commIsBoundary(c, BOTTOM));
-printf("\t\tTOP: %d\n", commIsBoundary(c, TOP));
-printf("\tCoordinates (i,j) %d %d\n", c->coords[IDIM], c->coords[JDIM]);
-printf("\tDims (i,j) %d %d\n", c->dims[IDIM], c->dims[JDIM]);
-printf("\tLocal domain size (i,j) %dx%d\n", c->imaxLocal, c->jmaxLocal);
 fflush(stdout);
 }
 MPI_Barrier(MPI_COMM_WORLD);
src/comm.h | 12 lines changed
@@ -10,6 +10,10 @@

 #include "matrix.h"

+#define MAX_EXTERNAL 100000
+#define MAX_NUM_MESSAGES 500
+#define MAX_NUM_NEIGHBOURS MAX_NUM_MESSAGES
+
 enum op { MAX = 0, SUM };

 typedef struct {
@@ -24,9 +28,9 @@ typedef struct {
 int* external_local_index;
 int total_to_be_sent;
 int* elements_to_send;
-int* neighbors;
-int* recv_length;
-int* send_length;
+int neighbors[MAX_NUM_NEIGHBOURS];
+int recv_length[MAX_NUM_NEIGHBOURS];
+int send_length[MAX_NUM_NEIGHBOURS];
 double* send_buffer;
 #endif
 } Comm;
@@ -36,7 +40,7 @@ extern void commInit(Comm* c, int argc, char** argv);
 extern void commFinalize(Comm* c);
 extern void commPartition(Comm*, Matrix* m);
 extern void commPrintConfig(Comm*);
-extern void commExchange(Comm*, double*);
+extern void commExchange(Comm* c, Matrix* A, double* x);
 extern void commReduction(double* v, int op);

 static inline int commIsMaster(Comm* c) { return c->rank == 0; }
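With the neighbour arrays now fixed-size struct members and commExchange taking the Matrix as well, callers only pass the extra argument and the Comm needs no separate allocation for its exchange metadata. A sketch of the adjusted call sequence (the solver-side names p and alpha, and however A gets filled, are assumptions):

    #include "comm.h"
    #include "matrix.h"

    /* Sketch only: setup and one halo refresh with the ported interface. */
    Comm comm;
    Matrix A;                        /* assumed to be read/initialised elsewhere */

    commInit(&comm, argc, argv);     /* argc/argv from the hosting main()        */
    commPartition(&comm, &A);        /* builds externals, neighbours, lengths    */
    commExchange(&comm, &A, p);      /* was commExchange(&comm, p) before        */
    commReduction(&alpha, SUM);      /* unchanged                                */
    commFinalize(&comm);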