Partial port of communication code

Jan Eitzinger 2025-01-17 20:30:18 +01:00
parent 512e2903c8
commit 9f37fa73a9
Signed by: moebiusband
GPG Key ID: 2574BA29B90D6DD5
2 changed files with 248 additions and 257 deletions


@@ -14,10 +14,6 @@
#include "allocate.h"
#include "comm.h"
#define MAX_EXTERNAL 100000
#define MAX_NUM_MESSAGES 500
#define MAX_NUM_NEIGHBOURS MAX_NUM_MESSAGES
// subroutines local to this module
int sizeOfRank(int rank, int size, int N)
{
@@ -37,6 +33,7 @@ void commReduction(double* v, int op)
void commPartition(Comm* c, Matrix* A)
{
#ifdef _MPI
int rank = c->rank;
int size = c->size;
MPI_Comm comm = c->comm;
@@ -66,15 +63,12 @@ void commPartition(Comm* c, Matrix* A)
// - find out which processor owns the value.
// - Set up communication for sparse MV operation.
///////////////////////////////////////////
// Scan the indices and transform to local
///////////////////////////////////////////
int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
int num_external = 1;
int* external_index = (int*)allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
int num_external = 1;
c->external_index = external_index;
c->external_index = external_index;
for (int i = 0; i < A->totalNr; i++) {
externals[i] = -1;
@@ -95,49 +89,39 @@ void commPartition(Comm* c, Matrix* A)
// shift local rows to the start
if (start_row <= cur_ind && cur_ind <= stop_row) {
col_ind[j] -= start_row;
} else // Must find out if we have already set up this point
{
} else {
// Must find out if we have already set up this point
if (externals[cur_ind] == -1) {
externals[cur_ind] = num_external++;
if (num_external <= MAX_EXTERNAL) {
external_index[num_external - 1] = cur_ind;
// Mark index as external by adding one and negating it
ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1);
col_ind[j] = -(col_ind[j] + 1);
} else {
cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp"
<< endl;
abort();
printf("Must increase MAX_EXTERNAL\n");
exit(EXIT_FAILURE);
}
} else {
// Mark index as external by adding 1 and negating it
ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1);
// Mark index as external by adding 1 and negating it
col_ind[j] = -(col_ind[j] + 1);
}
}
}
}
////////////////////////////////////////////////////////////////////////////
// Go through list of externals to find out which processors must be accessed.
////////////////////////////////////////////////////////////////////////////
/**************************************************************************
Go through list of externals to find out which processors must be accessed.
**************************************************************************/
c->num_external = num_external;
int tmp_buffer[size];
int global_index_offsets[size];
A->num_external = num_external;
int* tmp_buffer = new int[size]; // Temp buffer space needed below
for (int i = 0; i < size; i++) {
tmp_buffer[i] = 0;
}
// Build list of global index offset
int* global_index_offsets = new int[size];
for (i = 0; i < size; i++)
tmp_buffer[i] = 0; // First zero out
tmp_buffer[rank] = start_row; // This is my start row
// This call sends the start_row of each ith processor to the ith
// entry of global_index_offsets on all processors.
// Thus, each processor knows the range of indices owned by all
// other processors.
// Note: There might be a better algorithm for doing this, but this
// will work...
tmp_buffer[rank] = start_row;
MPI_Allreduce(tmp_buffer,
global_index_offsets,
@@ -147,105 +131,97 @@ void commPartition(Comm* c, Matrix* A)
MPI_COMM_WORLD);
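/* Illustrative sketch (not part of this commit): with 3 ranks owning rows
 * 0-99, 100-199 and 200-349, every rank writes only its own slot, so the
 * element-wise MPI_SUM reduction yields the same offset table {0, 100, 200}
 * on all ranks. A hypothetical standalone equivalent:
 *
 *   int my_offsets[3] = { 0, 0, 0 };
 *   my_offsets[rank] = start_row;   // e.g. rank 1 contributes 100
 *   MPI_Allreduce(MPI_IN_PLACE, my_offsets, 3, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 *   // my_offsets == {0, 100, 200} everywhere
 */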
// Go through list of externals and find the processor that owns each
int* external_processor = new int[num_external];
int* new_external_processor = new int[num_external];
int external_processor[num_external];
for (i = 0; i < num_external; i++) {
for (int i = 0; i < num_external; i++) {
int cur_ind = external_index[i];
for (int j = size - 1; j >= 0; j--)
for (int j = size - 1; j >= 0; j--) {
if (global_index_offsets[j] <= cur_ind) {
external_processor[i] = j;
break;
}
}
if (debug) {
t0 = mytimer() - t0;
cout << " Time in finding processors phase = " << t0 << endl;
}
}
////////////////////////////////////////////////////////////////////////////
// Sift through the external elements. For each newly encountered external
// point assign it the next index in the sequence. Then look for other
// external elements who are update by the same node and assign them the next
// set of index numbers in the sequence (ie. elements updated by the same node
// have consecutive indices).
////////////////////////////////////////////////////////////////////////////
if (debug) t0 = mytimer();
/* Go through the external elements. For each newly encountered external
point assign it the next index in the local sequence. Then look for other
external elements that are updated by the same node and assign them the next
set of index numbers in the local sequence (i.e. elements updated by the same
node have consecutive indices). */
int* external_local_index = (int*)allocate(ARRAY_ALIGNMENT,
MAX_EXTERNAL * sizeof(int));
c->external_local_index = external_local_index;
int count = local_nrow;
for (i = 0; i < num_external; i++)
external_local_index[i] = -1;
for (i = 0; i < num_external; i++) {
for (int i = 0; i < num_external; i++) {
external_local_index[i] = -1;
}
for (int i = 0; i < num_external; i++) {
if (external_local_index[i] == -1) {
external_local_index[i] = count++;
for (j = i + 1; j < num_external; j++) {
if (external_processor[j] == external_processor[i])
for (int j = i + 1; j < num_external; j++) {
if (external_processor[j] == external_processor[i]) {
external_local_index[j] = count++;
}
}
}
}
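/* Worked example (not part of this commit): with local_nrow == 100 and the
 * externals owned, in encounter order, by processors {2, 0, 2, 1}, the loop
 * above assigns external_local_index == {100, 102, 101, 103}; the two
 * elements owned by processor 2 get the consecutive local ids 100 and 101. */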
if (debug) {
t0 = mytimer() - t0;
cout << " Time in scanning external indices phase = " << t0 << endl;
}
if (debug) t0 = mytimer();
// map all external ids to the new local index
CG_UINT* rowPtr = A->rowPtr;
for (i = 0; i < local_nrow; i++) {
for (j = 0; j < nnz_in_row[i]; j++) {
if (ptr_to_inds_in_row[i][j] < 0) // Change index values of externals
{
int cur_ind = -ptr_to_inds_in_row[i][j] - 1;
ptr_to_inds_in_row[i][j] = external_local_index[externals[cur_ind]];
for (int i = 0; i < local_nrow; i++) {
for (int j = rowPtr[i]; j < rowPtr[i + 1]; j++) {
if (col_ind[j] < 0) {
int cur_ind = -col_ind[j] - 1; // undo the add-one-and-negate marking
col_ind[j] = external_local_index[externals[cur_ind]];
}
}
}
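/* Note on the add-one-and-negate marking: plain negation could not flag a
 * column whose global index is 0 (since -0 == 0), so the index is shifted by
 * one before negation and the shift is undone here when decoding. */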
for (i = 0; i < num_external; i++)
int new_external_processor[num_external];
for (int i = 0; i < num_external; i++) {
new_external_processor[i] = 0;
}
for (i = 0; i < num_external; i++)
// set up the map from external local id to owning partition
for (int i = 0; i < num_external; i++) {
new_external_processor[external_local_index[i] - local_nrow] =
external_processor[i];
if (debug) {
t0 = mytimer() - t0;
cout << " Time in assigning external indices phase = " << t0 << endl;
}
if (debug_details) {
for (i = 0; i < num_external; i++) {
cout << "Processor " << rank << " of " << size << ": external processor[" << i
<< "] = " << external_processor[i] << endl;
cout << "Processor " << rank << " of " << size << ": new external processor["
<< i << "] = " << new_external_processor[i] << endl;
}
#ifdef VERBOSE
for (int i = 0; i < num_external; i++) {
printf("Process %d of %d: external process[%d] = %d\n",
rank,
size,
i,
external_processor[i]);
}
#endif
////////////////////////////////////////////////////////////////////////////
///
// Count the number of neighbors from which we receive information to update
// our external elements. Additionally, fill the array tmp_neighbors in the
// following way:
// tmp_neighbors[i] = 0 ==> No external elements are updated by
// processor i.
// tmp_neighbors[i] = x ==> (x-1)/size elements are updated from
// processor i.
///
////////////////////////////////////////////////////////////////////////////
/* Count the number of neighbors from which we receive information to update
our external elements. Additionally, fill the array tmp_neighbors in the
following way:
tmp_neighbors[i] = 0 ==> No external elements are updated by
processor i.
tmp_neighbors[i] = x ==> (x-1)/size elements are updated from
processor i.*/
t0 = mytimer();
int* tmp_neighbors = new int[size];
for (i = 0; i < size; i++)
int tmp_neighbors[size];
for (int i = 0; i < size; i++) {
tmp_neighbors[i] = 0;
}
int num_recv_neighbors = 0;
int length = 1;
for (i = 0; i < num_external; i++) {
for (int i = 0; i < num_external; i++) {
if (tmp_neighbors[new_external_processor[i]] == 0) {
num_recv_neighbors++;
tmp_neighbors[new_external_processor[i]] = 1;
@@ -253,90 +229,72 @@ void commPartition(Comm* c, Matrix* A)
tmp_neighbors[new_external_processor[i]] += size;
}
/// sum over all processors all the tmp_neighbors arrays ///
// sum over all processors all the tmp_neighbors arrays
MPI_Allreduce(tmp_neighbors, tmp_buffer, size, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
/// decode the combined 'tmp_neighbors' (stored in tmp_buffer)
// array from all the processors
/* decode the combined 'tmp_neighbors' (stored in tmp_buffer) array from all the
* processors */
int num_send_neighbors = tmp_buffer[rank] % size;
/// decode 'tmp_buffer[rank] to deduce total number of elements
// we must send
/* decode 'tmp_buffer[rank]' to deduce the total number of elements we must send */
int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;
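/* Worked example of the encoding (not part of this commit): assume size == 4
 * and that ranks 1 and 3 need 3 and 5 of this rank's rows, respectively.
 * Rank 1 then contributes 1 + 3*4 = 13 and rank 3 contributes 1 + 5*4 = 21 to
 * this rank's slot, so after the reduction tmp_buffer[rank] == 34, giving
 * num_send_neighbors == 34 % 4 == 2 and total_to_be_sent == (34 - 2) / 4 == 8. */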
//
// Check to see if we have enough workspace allocated. This could be
// dynamically modified, but let's keep it simple for now...
//
/* Check to see if we have enough workspace allocated. This could be
dynamically modified, but let's keep it simple for now...*/
if (num_send_neighbors > MAX_NUM_MESSAGES) {
cerr << "Must increase MAX_NUM_MESSAGES in HPC_Sparse_Matrix.hpp" << endl;
cerr << "Must be at least " << num_send_neighbors << endl;
abort();
printf("Must increase MAX_NUM_MESSAGES. Must be at least %d\n",
num_send_neighbors);
exit(EXIT_FAILURE);
}
if (total_to_be_sent > MAX_EXTERNAL) {
cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl;
cerr << "Must be at least " << total_to_be_sent << endl;
abort();
printf("Must increase MAX_EXTERNAL. Must be at least %d\n", total_to_be_sent);
exit(EXIT_FAILURE);
}
delete[] tmp_neighbors;
if (debug) {
t0 = mytimer() - t0;
cout << " Time in finding neighbors phase = " << t0 << endl;
}
if (debug)
cout << "Processor " << rank << " of " << size
<< ": Number of send neighbors = " << num_send_neighbors << endl;
#ifdef VERBOSE
cout << "Processor " << rank << " of " << size
<< ": Number of send neighbors = " << num_send_neighbors << endl;
if (debug)
cout << "Processor " << rank << " of " << size
<< ": Number of receive neighbors = " << num_recv_neighbors << endl;
cout << "Processor " << rank << " of " << size
<< ": Number of receive neighbors = " << num_recv_neighbors << endl;
if (debug)
cout << "Processor " << rank << " of " << size
<< ": Total number of elements to send = " << total_to_be_sent << endl;
cout << "Processor " << rank << " of " << size
<< ": Total number of elements to send = " << total_to_be_sent << endl;
if (debug) MPI_Barrier(MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
#endif
/////////////////////////////////////////////////////////////////////////
///
// Make a list of the neighbors that will send information to update our
// external elements (in the order that we will receive this information).
///
/////////////////////////////////////////////////////////////////////////
/* Make a list of the neighbors that will send information to update our
external elements (in the order that we will receive this information).*/
int* recv_list = (int*)allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
int* recv_list = new int[MAX_EXTERNAL];
j = 0;
// FIXME: Create local scope
int j = 0;
recv_list[j++] = new_external_processor[0];
for (i = 1; i < num_external; i++) {
for (int i = 1; i < num_external; i++) {
if (new_external_processor[i - 1] != new_external_processor[i]) {
recv_list[j++] = new_external_processor[i];
}
}
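// This boundary scan produces exactly one entry per owning processor because
// the reordering above stored same-owner externals at consecutive local ids.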
//
// Ensure that all the neighbors we expect to receive from also send to us
// Send a 0 length message to each of our recv neighbors
//
int send_list[num_send_neighbors];
int* send_list = new int[num_send_neighbors];
for (i = 0; i < num_send_neighbors; i++)
for (int i = 0; i < num_send_neighbors; i++) {
send_list[i] = 0;
}
//
// first post receives; these are immediate (non-blocking) receives.
// Do not wait for the results here, that is done at the
// wait call below.
//
int MPI_MY_TAG = 99;
MPI_Request* request = new MPI_Request[MAX_NUM_MESSAGES];
for (i = 0; i < num_send_neighbors; i++) {
MPI_Request request[MAX_NUM_MESSAGES];
for (int i = 0; i < num_send_neighbors; i++) {
MPI_Irecv(tmp_buffer + i,
1,
MPI_INT,
@@ -347,88 +305,77 @@ void commPartition(Comm* c, Matrix* A)
}
// send messages
for (i = 0; i < num_recv_neighbors; i++)
for (int i = 0; i < num_recv_neighbors; i++) {
MPI_Send(tmp_buffer + i, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
///
// Receive message from each send neighbor to construct 'send_list'.
///
}
// Receive message from each send neighbor to construct 'send_list'.
MPI_Status status;
for (i = 0; i < num_send_neighbors; i++) {
for (int i = 0; i < num_send_neighbors; i++) {
if (MPI_Wait(request + i, &status)) {
cerr << "MPI_Wait error\n" << endl;
exit(-1);
printf("MPI_Wait error\n");
exit(EXIT_FAILURE);
}
send_list[i] = status.MPI_SOURCE;
}
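/* Minimal sketch of the zero-length handshake above (not part of this commit):
 * each rank posts one wildcard receive per expected sender, pings the ranks it
 * already knows it will receive data from, and learns its send partners from
 * the message sources.
 *
 *   int dummy = 0;
 *   MPI_Request req;
 *   MPI_Status st;
 *   MPI_Irecv(&dummy, 1, MPI_INT, MPI_ANY_SOURCE, 99, MPI_COMM_WORLD, &req);
 *   MPI_Send(&dummy, 1, MPI_INT, recv_list[0], 99, MPI_COMM_WORLD);
 *   MPI_Wait(&req, &st);
 *   int partner = st.MPI_SOURCE;   // this rank must later send data to partner
 */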
/////////////////////////////////////////////////////////////////////////
///
// Compare the two lists. In most cases they should be the same.
/* Compare the two lists. In most cases they should be the same.
// However, if they are not then add new entries to the recv list
// that are in the send list (but not already in the recv list).
///
/////////////////////////////////////////////////////////////////////////
for (j = 0; j < num_send_neighbors; j++) {
WHY!! This ensures that the send list is equal to the recv list.
But why is this required? -> Just one neighbour list??*/
for (int j = 0; j < num_send_neighbors; j++) {
int found = 0;
for (i = 0; i < num_recv_neighbors; i++) {
for (int i = 0; i < num_recv_neighbors; i++) {
if (recv_list[i] == send_list[j]) found = 1;
}
if (found == 0) {
if (debug)
cout << "Processor " << rank << " of " << size << ": recv_list["
<< num_recv_neighbors << "] = " << send_list[j] << endl;
#ifdef VERBOSE
printf("Process %d of %d: recv_list[%d] = %d\n",
rank,
size,
num_recv_neighbors,
send_list[j]);
#endif
recv_list[num_recv_neighbors] = send_list[j];
(num_recv_neighbors)++;
}
}
delete[] send_list;
num_send_neighbors = num_recv_neighbors;
if (num_send_neighbors > MAX_NUM_MESSAGES) {
cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl;
abort();
printf("Must increase MAX_EXTERNAL\n");
exit(EXIT_FAILURE);
}
/////////////////////////////////////////////////////////////////////////
/// Start filling HPC_Sparse_Matrix struct
/////////////////////////////////////////////////////////////////////////
A->total_to_be_sent = total_to_be_sent;
int* elements_to_send = new int[total_to_be_sent];
A->elements_to_send = elements_to_send;
for (i = 0; i < total_to_be_sent; i++)
elements_to_send[i] = 0;
//
// Start filling communication setup
// Create 'new_external' which explicitly put the external elements in the
// order given by 'external_local_index'
//
c->total_to_be_sent = total_to_be_sent;
int* elements_to_send = (int*)allocate(ARRAY_ALIGNMENT,
total_to_be_sent * sizeof(int));
c->elements_to_send = elements_to_send;
int* new_external = new int[num_external];
for (i = 0; i < num_external; i++) {
for (int i = 0; i < total_to_be_sent; i++) {
elements_to_send[i] = 0;
}
// Create 'new_external' which explicitly puts the external elements in the
// order given by 'external_local_index'
int* new_external = (int*)allocate(ARRAY_ALIGNMENT, num_external * sizeof(int));
for (int i = 0; i < num_external; i++) {
new_external[external_local_index[i] - local_nrow] = external_index[i];
}
/////////////////////////////////////////////////////////////////////////
//
// Send each processor the global index list of the external elements in the
// order that I will want to receive them when updating my external elements
//
/////////////////////////////////////////////////////////////////////////
int* lengths = new int[num_recv_neighbors];
int lengths[num_recv_neighbors];
MPI_MY_TAG++;
// First post receives
for (i = 0; i < num_recv_neighbors; i++) {
for (int i = 0; i < num_recv_neighbors; i++) {
int partner = recv_list[i];
MPI_Irecv(lengths + i,
1,
@@ -439,22 +386,18 @@ void commPartition(Comm* c, Matrix* A)
request + i);
}
int* neighbors = new int[MAX_NUM_NEIGHBOURS];
int* recv_length = new int[MAX_NUM_NEIGHBOURS];
int* send_length = new int[MAX_NUM_NEIGHBOURS];
A->neighbors = neighbors;
A->recv_length = recv_length;
A->send_length = send_length;
int* neighbors = c->neighbors;
int* recv_length = c->recv_length;
int* send_length = c->send_length;
j = 0;
for (i = 0; i < num_recv_neighbors; i++) {
for (int i = 0; i < num_recv_neighbors; i++) {
int start = j;
int newlength = 0;
// go through list of external elements until updating
// processor changes
while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
newlength++;
j++;
@ -469,25 +412,21 @@ void commPartition(Comm* c, Matrix* A)
}
// Complete the receives of the number of externals
for (i = 0; i < num_recv_neighbors; i++) {
for (int i = 0; i < num_recv_neighbors; i++) {
if (MPI_Wait(request + i, &status)) {
cerr << "MPI_Wait error\n" << endl;
exit(-1);
printf("MPI_Wait error\n");
exit(EXIT_FAILURE);
}
send_length[i] = lengths[i];
}
delete[] lengths;
///////////////////////////////////////////////////////////////////
// Build "elements_to_send" list. These are the x elements I own
// that need to be sent to other processors.
///////////////////////////////////////////////////////////////////
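// (In the exchange below each neighbor sends us the global ids of the rows it
// needs from us; after subtracting start_row they become the local row ids
// that commExchange later gathers into send_buffer.)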
MPI_MY_TAG++;
j = 0;
for (i = 0; i < num_recv_neighbors; i++) {
for (int i = 0; i < num_recv_neighbors; i++) {
MPI_Irecv(elements_to_send + j,
send_length[i],
MPI_INT,
@@ -499,14 +438,14 @@ void commPartition(Comm* c, Matrix* A)
}
j = 0;
for (i = 0; i < num_recv_neighbors; i++) {
for (int i = 0; i < num_recv_neighbors; i++) {
int start = j;
int newlength = 0;
// Go through list of external elements
// until updating processor changes. This is redundant, but
// saves us from recording this information.
while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
newlength++;
@@ -522,39 +461,101 @@ void commPartition(Comm* c, Matrix* A)
}
// receive from each neighbor the global index list of external elements
for (i = 0; i < num_recv_neighbors; i++) {
for (int i = 0; i < num_recv_neighbors; i++) {
if (MPI_Wait(request + i, &status)) {
cerr << "MPI_Wait error\n" << endl;
exit(-1);
printf("MPI_Wait error\n");
exit(EXIT_FAILURE);
}
}
/// replace global indices by local indices ///
for (i = 0; i < total_to_be_sent; i++)
/// replace global indices by local indices
for (int i = 0; i < total_to_be_sent; i++) {
elements_to_send[i] -= start_row;
}
////////////////
// Finish up !!
////////////////
c->num_send_neighbors = num_send_neighbors;
A->nc = A->nc + num_external;
A->num_send_neighbors = num_send_neighbors;
A->local_ncol = A->local_nrow + num_external;
// Used in exchange
CG_FLOAT* send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
total_to_be_sent * sizeof(CG_FLOAT));
c->send_buffer = send_buffer;
// Used in exchange_externals
double* send_buffer = new double[total_to_be_sent];
A->send_buffer = send_buffer;
free(recv_list);
free(new_external);
#endif
}
delete[] tmp_buffer;
delete[] global_index_offsets;
delete[] recv_list;
delete[] external_processor;
delete[] new_external;
delete[] new_external_processor;
delete[] request;
void commExchange(Comm* c, Matrix* A, double* x)
{
#ifdef _MPI
int num_external = 0;
return;
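// NOTE: the early return above leaves the exchange below disabled, presumably
// intentionally until this part of the partial port is completed.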
// Extract Matrix pieces
int local_nrow = A->nr;
int num_neighbors = c->num_send_neighbors;
int* recv_length = c->recv_length;
int* send_length = c->send_length;
int* neighbors = c->neighbors;
double* send_buffer = c->send_buffer;
int total_to_be_sent = c->total_to_be_sent;
int* elements_to_send = c->elements_to_send;
int rank = c->rank;
int size = c->size;
MPI_Comm comm = c->comm;
// First post the receives. These are immediate (non-blocking) receives;
// do not wait for the data to arrive here, that happens at the
// MPI_Wait calls below.
int MPI_MY_TAG = 99;
MPI_Request request[num_neighbors];
// Externals are at end of locals
double* x_external = (double*)x + local_nrow;
// Post receives first
for (int i = 0; i < num_neighbors; i++) {
int n_recv = recv_length[i];
MPI_Irecv(x_external,
n_recv,
MPI_DOUBLE,
neighbors[i],
MPI_MY_TAG,
MPI_COMM_WORLD,
request + i);
x_external += n_recv;
}
// Fill up send buffer
for (int i = 0; i < total_to_be_sent; i++) {
send_buffer[i] = x[elements_to_send[i]];
}
// Send to each neighbor
for (int i = 0; i < num_neighbors; i++) {
int n_send = send_length[i];
MPI_Send(send_buffer,
n_send,
MPI_DOUBLE,
neighbors[i],
MPI_MY_TAG,
MPI_COMM_WORLD);
send_buffer += n_send;
}
// Complete the receives issued above
MPI_Status status;
for (int i = 0; i < num_neighbors; i++) {
if (MPI_Wait(request + i, &status)) {
printf("MPI_Wait error\n");
exit(EXIT_FAILURE);
}
}
#endif
}
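/* Hypothetical call-site sketch (not part of this commit): the external values
 * live at the tail of x (local ids >= A->nr), so the exchange has to complete
 * before a distributed SpMV dereferences the remapped column indices.
 *
 *   commExchange(&comm, &A, x);   // fills x[A->nr .. A->nr + num_external - 1]
 *   spMVM(&A, x, y);              // hypothetical SpMV kernel using local ids
 */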
void commPrintConfig(Comm* c)
@@ -568,20 +569,6 @@ void commPrintConfig(Comm* c)
for (int i = 0; i < c->size; i++) {
if (i == c->rank) {
printf("\tRank %d of %d\n", c->rank, c->size);
printf("\tNeighbours (bottom, top, left, right): %d %d, %d, %d\n",
c->neighbours[BOTTOM],
c->neighbours[TOP],
c->neighbours[LEFT],
c->neighbours[RIGHT]);
printf("\tIs boundary:\n");
printf("\t\tLEFT: %d\n", commIsBoundary(c, LEFT));
printf("\t\tRIGHT: %d\n", commIsBoundary(c, RIGHT));
printf("\t\tBOTTOM: %d\n", commIsBoundary(c, BOTTOM));
printf("\t\tTOP: %d\n", commIsBoundary(c, TOP));
printf("\tCoordinates (i,j) %d %d\n", c->coords[IDIM], c->coords[JDIM]);
printf("\tDims (i,j) %d %d\n", c->dims[IDIM], c->dims[JDIM]);
printf("\tLocal domain size (i,j) %dx%d\n", c->imaxLocal, c->jmaxLocal);
fflush(stdout);
}
MPI_Barrier(MPI_COMM_WORLD);


@@ -10,6 +10,10 @@
#include "matrix.h"
#define MAX_EXTERNAL 100000
#define MAX_NUM_MESSAGES 500
#define MAX_NUM_NEIGHBOURS MAX_NUM_MESSAGES
enum op { MAX = 0, SUM };
typedef struct {
@@ -24,9 +28,9 @@ typedef struct {
int* external_local_index;
int total_to_be_sent;
int* elements_to_send;
int* neighbors;
int* recv_length;
int* send_length;
int neighbors[MAX_NUM_NEIGHBOURS];
int recv_length[MAX_NUM_NEIGHBOURS];
int send_length[MAX_NUM_NEIGHBOURS];
double* send_buffer;
#endif
} Comm;
@@ -36,7 +40,7 @@ extern void commInit(Comm* c, int argc, char** argv);
extern void commFinalize(Comm* c);
extern void commPartition(Comm*, Matrix* m);
extern void commPrintConfig(Comm*);
extern void commExchange(Comm*, double*);
extern void commExchange(Comm* c, Matrix* A, double* x);
extern void commReduction(double* v, int op);
static inline int commIsMaster(Comm* c) { return c->rank == 0; }
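/* Hypothetical usage sketch (not part of this commit): typical lifecycle of a
 * Comm object in a distributed CG solve, using only the functions declared
 * above.
 *
 *   Comm comm;
 *   commInit(&comm, argc, argv);    // set up rank, size and the communicator
 *   commPartition(&comm, &A);       // build neighbor lists and send buffer
 *   commExchange(&comm, &A, x);     // halo exchange before each SpMV
 *   commReduction(&alpha, SUM);     // global dot-product reduction
 *   commFinalize(&comm);            // shut MPI down
 */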