From 9f37fa73a98043129240ed91384105e65c58b911 Mon Sep 17 00:00:00 2001 From: Jan Eitzinger Date: Fri, 17 Jan 2025 20:30:18 +0100 Subject: [PATCH] Partial port of communication code --- src/comm.c | 493 ++++++++++++++++++++++++++--------------------------- src/comm.h | 12 +- 2 files changed, 248 insertions(+), 257 deletions(-) diff --git a/src/comm.c b/src/comm.c index d071965..e71819e 100644 --- a/src/comm.c +++ b/src/comm.c @@ -14,10 +14,6 @@ #include "allocate.h" #include "comm.h" -#define MAX_EXTERNAL 100000 -#define MAX_NUM_MESSAGES 500 -#define MAX_NUM_NEIGHBOURS MAX_NUM_MESSAGES - // subroutines local to this module int sizeOfRank(int rank, int size, int N) { @@ -37,6 +33,7 @@ void commReduction(double* v, int op) void commPartition(Comm* c, Matrix* A) { +#ifdef _MPI int rank = c->rank; int size = c->size; MPI_Comm comm = c->comm; @@ -66,15 +63,12 @@ void commPartition(Comm* c, Matrix* A) // - find out which processor owns the value. // - Set up communication for sparse MV operation. - /////////////////////////////////////////// // Scan the indices and transform to local - /////////////////////////////////////////// + int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int)); + int num_external = 1; int* external_index = (int*)allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int)); - int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int)); - int num_external = 1; - - c->external_index = external_index; + c->external_index = external_index; for (int i = 0; i < A->totalNr; i++) { externals[i] = -1; @@ -95,49 +89,39 @@ void commPartition(Comm* c, Matrix* A) // shift local rows to the start if (start_row <= cur_ind && cur_ind <= stop_row) { col_ind[j] -= start_row; - } else // Must find out if we have already set up this point - { + } else { + // Must find out if we have already set up this point if (externals[cur_ind] == -1) { externals[cur_ind] = num_external++; if (num_external <= MAX_EXTERNAL) { external_index[num_external - 1] = cur_ind; // Mark index as external by negating it - ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1); + col_ind[j] = -col_ind[j]; } else { - cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" - << endl; - abort(); + printf("Must increase MAX_EXTERNAL\n"); + exit(EXIT_FAILURE); } } else { - // Mark index as external by adding 1 and negating it - ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1); + // Mark index as external by negating it + col_ind[j] = -col_ind[j]; } } } } - //////////////////////////////////////////////////////////////////////////// - // Go through list of externals to find out which processors must be accessed. - //////////////////////////////////////////////////////////////////////////// + /************************************************************************** + Go through list of externals to find out which processors must be accessed. 
+ **************************************************************************/ + c->num_external = num_external; + int tmp_buffer[size]; + int global_index_offsets[size]; - A->num_external = num_external; - int* tmp_buffer = new int[size]; // Temp buffer space needed below + for (int i = 0; i < size; i++) { + tmp_buffer[i] = 0; + } - // Build list of global index offset - - int* global_index_offsets = new int[size]; - for (i = 0; i < size; i++) - tmp_buffer[i] = 0; // First zero out - - tmp_buffer[rank] = start_row; // This is my start row - - // This call sends the start_row of each ith processor to the ith - // entry of global_index_offset on all processors. - // Thus, each processor know the range of indices owned by all - // other processors. - // Note: There might be a better algorithm for doing this, but this - // will work... + tmp_buffer[rank] = start_row; MPI_Allreduce(tmp_buffer, global_index_offsets, @@ -147,105 +131,97 @@ void commPartition(Comm* c, Matrix* A) MPI_COMM_WORLD); // Go through list of externals and find the processor that owns each - int* external_processor = new int[num_external]; - int* new_external_processor = new int[num_external]; + int external_processor[num_external]; - for (i = 0; i < num_external; i++) { + for (int i = 0; i < num_external; i++) { int cur_ind = external_index[i]; - for (int j = size - 1; j >= 0; j--) + for (int j = size - 1; j >= 0; j--) { if (global_index_offsets[j] <= cur_ind) { external_processor[i] = j; break; } - } - if (debug) { - t0 = mytimer() - t0; - cout << " Time in finding processors phase = " << t0 << endl; + } } - //////////////////////////////////////////////////////////////////////////// - // Sift through the external elements. For each newly encountered external - // point assign it the next index in the sequence. Then look for other - // external elements who are update by the same node and assign them the next - // set of index numbers in the sequence (ie. elements updated by the same node - // have consecutive indices). - //////////////////////////////////////////////////////////////////////////// - - if (debug) t0 = mytimer(); + /*Go through the external elements. For each newly encountered external + point assign it the next index in the local sequence. Then look for other + external elements who are updated by the same node and assign them the next + set of index numbers in the local sequence (ie. 
elements updated by the same node
+    have consecutive indices). */
+    int* external_local_index = (int*)allocate(ARRAY_ALIGNMENT,
+        MAX_EXTERNAL * sizeof(int));
+    c->external_local_index = external_local_index;
 
     int count = local_nrow;
 
-  for (i = 0; i < num_external; i++)
-    external_local_index[i] = -1;
 
-  for (i = 0; i < num_external; i++) {
+    for (int i = 0; i < num_external; i++) {
+        external_local_index[i] = -1;
+    }
+
+    for (int i = 0; i < num_external; i++) {
         if (external_local_index[i] == -1) {
             external_local_index[i] = count++;
 
-      for (j = i + 1; j < num_external; j++) {
-        if (external_processor[j] == external_processor[i])
+            for (int j = i + 1; j < num_external; j++) {
+                if (external_processor[j] == external_processor[i]) {
                     external_local_index[j] = count++;
+                }
             }
         }
     }
 
-  if (debug) {
-    t0 = mytimer() - t0;
-    cout << " Time in scanning external indices phase = " << t0 << endl;
-  }
-  if (debug) t0 = mytimer();
+    // map all external ids to the new local index
+    CG_UINT* rowPtr = A->rowPtr;
 
-  for (i = 0; i < local_nrow; i++) {
-    for (j = 0; j < nnz_in_row[i]; j++) {
-      if (ptr_to_inds_in_row[i][j] < 0) // Change index values of externals
-      {
-        int cur_ind = -ptr_to_inds_in_row[i][j] - 1;
-        ptr_to_inds_in_row[i][j] = external_local_index[externals[cur_ind]];
+    for (int i = 0; i < local_nrow; i++) {
+        for (int j = rowPtr[i]; j < rowPtr[i + 1]; j++) {
+            if (col_ind[j] < 0) {
+                // undo the plain negation used above to mark this index as external
+                int cur_ind = -col_ind[j];
+                col_ind[j] = external_local_index[externals[cur_ind]];
             }
         }
     }
 
-  for (i = 0; i < num_external; i++)
+    int new_external_processor[num_external];
+
+    for (int i = 0; i < num_external; i++) {
         new_external_processor[i] = 0;
+    }
 
-  for (i = 0; i < num_external; i++)
+    // setup map from external id to partition
+    for (int i = 0; i < num_external; i++) {
         new_external_processor[external_local_index[i] - local_nrow] = external_processor[i];
-
-  if (debug) {
-    t0 = mytimer() - t0;
-    cout << " Time in assigning external indices phase = " << t0 << endl;
     }
 
-  if (debug_details) {
-    for (i = 0; i < num_external; i++) {
-      cout << "Processor " << rank << " of " << size << ": external processor[" << i
-           << "] = " << external_processor[i] << endl;
-      cout << "Processor " << rank << " of " << size << ": new external processor["
-           << i << "] = " << new_external_processor[i] << endl;
-    }
+#ifdef VERBOSE
+    for (int i = 0; i < num_external; i++) {
+        printf("Process %d of %d: external process[%d] = %d\n",
+            rank,
+            size,
+            i,
+            external_processor[i]);
     }
+#endif
 
+    /* Count the number of neighbors from which we receive information to update
+    our external elements. Additionally, fill the array tmp_neighbors in the
+    following way:
+    tmp_neighbors[i] = 0 ==> No external elements are updated by
+    processor i.
+    tmp_neighbors[i] = x ==> (x-1)/size elements are updated from
+    processor i. */
 
-  t0 = mytimer();
-  int* tmp_neighbors = new int[size];
-  for (i = 0; i < size; i++)
+    int tmp_neighbors[size];
+
+    for (int i = 0; i < size; i++) {
         tmp_neighbors[i] = 0;
+    }
 
     int num_recv_neighbors = 0;
     int length = 1;
 
-  for (i = 0; i < num_external; i++) {
+    for (int i = 0; i < num_external; i++) {
         if (tmp_neighbors[new_external_processor[i]] == 0) {
             num_recv_neighbors++;
             tmp_neighbors[new_external_processor[i]] = 1;
@@ -253,90 +229,72 @@ void commPartition(Comm* c, Matrix* A)
         tmp_neighbors[new_external_processor[i]] += size;
     }
 
-  /// sum over all processors all the tmp_neighbors arrays ///
-
+    // sum over all processors all the tmp_neighbors arrays
     MPI_Allreduce(tmp_neighbors, tmp_buffer, size, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
 
-  /// decode the combined 'tmp_neighbors' (stored in tmp_buffer)
-  //  array from all the processors
-
+    /* decode the combined 'tmp_neighbors' (stored in tmp_buffer) array from all the
+     * processors */
     int num_send_neighbors = tmp_buffer[rank] % size;
 
-  /// decode 'tmp_buffer[rank] to deduce total number of elements
-  //  we must send
-
+    /* decode 'tmp_buffer[rank]' to deduce total number of elements we must send */
     int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;
 
-  //
-  // Check to see if we have enough workspace allocated. This could be
-  // dynamically modified, but let's keep it simple for now...
-  //
-
+    /* Check to see if we have enough workspace allocated. This could be
+    dynamically modified, but let's keep it simple for now...*/
     if (num_send_neighbors > MAX_NUM_MESSAGES) {
-    cerr << "Must increase MAX_NUM_MESSAGES in HPC_Sparse_Matrix.hpp" << endl;
-    cerr << "Must be at least " << num_send_neighbors << endl;
-    abort();
+        printf("Must increase MAX_NUM_MESSAGES. Must be at least %d\n",
+            num_send_neighbors);
+        exit(EXIT_FAILURE);
     }
 
     if (total_to_be_sent > MAX_EXTERNAL) {
-    cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl;
-    cerr << "Must be at least " << total_to_be_sent << endl;
-    abort();
+        printf("Must increase MAX_EXTERNAL. Must be at least %d\n", total_to_be_sent);
+        exit(EXIT_FAILURE);
     }
 
-  delete[] tmp_neighbors;
 
-  if (debug) {
-    t0 = mytimer() - t0;
-    cout << " Time in finding neighbors phase = " << t0 << endl;
-  }
 
-  if (debug)
-    cout << "Processor " << rank << " of " << size
-         << ": Number of send neighbors = " << num_send_neighbors << endl;
+#ifdef VERBOSE
+    printf("Process %d of %d: number of send neighbors = %d\n",
+        rank,
+        size,
+        num_send_neighbors);
 
-  if (debug)
-    cout << "Processor " << rank << " of " << size
-         << ": Number of receive neighbors = " << num_recv_neighbors << endl;
+    printf("Process %d of %d: number of receive neighbors = %d\n",
+        rank,
+        size,
+        num_recv_neighbors);
 
-  if (debug)
-    cout << "Processor " << rank << " of " << size
-         << ": Total number of elements to send = " << total_to_be_sent << endl;
+    printf("Process %d of %d: total number of elements to send = %d\n",
+        rank,
+        size,
+        total_to_be_sent);
 
-  if (debug) MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
 
-  /////////////////////////////////////////////////////////////////////////
-  ///
-  // Make a list of the neighbors that will send information to update our
-  // external elements (in the order that we will receive this information).
-  ///
-  /////////////////////////////////////////////////////////////////////////
+    /* Make a list of the neighbors that will send information to update our
+    external elements (in the order that we will receive this information).*/
+    int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
 
-  int* recv_list = new int[MAX_EXTERNAL];
-
-  j = 0;
+    // FIXME: Create local scope
+    int j = 0;
     recv_list[j++] = new_external_processor[0];
-  for (i = 1; i < num_external; i++) {
+
+    for (int i = 1; i < num_external; i++) {
         if (new_external_processor[i - 1] != new_external_processor[i]) {
             recv_list[j++] = new_external_processor[i];
         }
     }
 
-  //
+    // Ensure that all the neighbors we expect to receive from also send to us
     // Send a 0 length message to each of our recv neighbors
-  //
+    int send_list[num_send_neighbors];
 
-  int* send_list = new int[num_send_neighbors];
-  for (i = 0; i < num_send_neighbors; i++)
+    for (int i = 0; i < num_send_neighbors; i++) {
         send_list[i] = 0;
+    }
 
-  //
     // first post receives, these are immediate receives
     // Do not wait for result to come, will do that at the
    // wait call below.
-  //
    int MPI_MY_TAG = 99;
 
-  MPI_Request* request = new MPI_Request[MAX_NUM_MESSAGES];
-  for (i = 0; i < num_send_neighbors; i++) {
+    MPI_Request request[MAX_NUM_MESSAGES];
+
+    for (int i = 0; i < num_send_neighbors; i++) {
         MPI_Irecv(tmp_buffer + i,
            1,
            MPI_INT,
@@ -347,88 +305,77 @@ void commPartition(Comm* c, Matrix* A)
     }
 
     // send messages
-
-  for (i = 0; i < num_recv_neighbors; i++)
+    for (int i = 0; i < num_recv_neighbors; i++) {
         MPI_Send(tmp_buffer + i, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
+    }
 
-  ///
-  // Receive message from each send neighbor to construct 'send_list'.
-  ///
+    // Receive message from each send neighbor to construct 'send_list'.
     MPI_Status status;
 
-  for (i = 0; i < num_send_neighbors; i++) {
+    for (int i = 0; i < num_send_neighbors; i++) {
        if (MPI_Wait(request + i, &status)) {
-      cerr << "MPI_Wait error\n" << endl;
-      exit(-1);
+            printf("MPI_Wait error\n");
+            exit(EXIT_FAILURE);
        }
        send_list[i] = status.MPI_SOURCE;
    }
 
-  /////////////////////////////////////////////////////////////////////////
-  ///
-  // Compare the two lists. In most cases they should be the same.
+    /* Compare the two lists. In most cases they should be the same.
     // However, if they are not then add new entries to the recv list
     // that are in the send list (but not already in the recv list).
-  ///
-  /////////////////////////////////////////////////////////////////////////
-
-  for (j = 0; j < num_send_neighbors; j++) {
+    Afterwards both lists contain the same set of processes, so a single,
+    symmetric neighbour list can be used for both sending and receiving. */
+    for (int j = 0; j < num_send_neighbors; j++) {
         int found = 0;
-    for (i = 0; i < num_recv_neighbors; i++) {
+        for (int i = 0; i < num_recv_neighbors; i++) {
             if (recv_list[i] == send_list[j]) found = 1;
         }
 
         if (found == 0) {
-      if (debug)
-        cout << "Processor " << rank << " of " << size << ": recv_list["
-             << num_recv_neighbors << "] = " << send_list[j] << endl;
+#ifdef VERBOSE
+            printf("Process %d of %d: recv_list[%d] = %d\n",
+                rank,
+                size,
+                num_recv_neighbors,
+                send_list[j]);
+#endif
             recv_list[num_recv_neighbors] = send_list[j];
             (num_recv_neighbors)++;
         }
     }
-
-  delete[] send_list;
 
     num_send_neighbors = num_recv_neighbors;
 
     if (num_send_neighbors > MAX_NUM_MESSAGES) {
-    cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl;
-    abort();
+        printf("Must increase MAX_NUM_MESSAGES\n");
+        exit(EXIT_FAILURE);
     }
 
-  /////////////////////////////////////////////////////////////////////////
-  /// Start filling HPC_Sparse_Matrix struct
-  /////////////////////////////////////////////////////////////////////////
-
-  A->total_to_be_sent = total_to_be_sent;
-  int* elements_to_send = new int[total_to_be_sent];
-  A->elements_to_send = elements_to_send;
-
-  for (i = 0; i < total_to_be_sent; i++)
-    elements_to_send[i] = 0;
-
-  //
-  // Create 'new_external' which explicitly put the external elements in the
-  // order given by 'external_local_index'
-  //
+    // Start filling communication setup
+    c->total_to_be_sent = total_to_be_sent;
+    int* elements_to_send = (int*)allocate(ARRAY_ALIGNMENT,
+        total_to_be_sent * sizeof(int));
+    c->elements_to_send = elements_to_send;
 
-  int* new_external = new int[num_external];
-  for (i = 0; i < num_external; i++) {
+    for (int i = 0; i < total_to_be_sent; i++) {
+        elements_to_send[i] = 0;
+    }
+
+    // Create 'new_external' which explicitly puts the external elements in the
+    // order given by 'external_local_index'
+    int* new_external = (int*)allocate(ARRAY_ALIGNMENT, num_external * sizeof(int));
+
+    for (int i = 0; i < num_external; i++) {
         new_external[external_local_index[i] - local_nrow] = external_index[i];
     }
 
-  /////////////////////////////////////////////////////////////////////////
-  //
     // Send each processor the global index list of the external elements in the
     // order that I will want to receive them when updating my external elements
-  //
-  /////////////////////////////////////////////////////////////////////////
-
-  int* lengths = new int[num_recv_neighbors];
-
+    int lengths[num_recv_neighbors];
     MPI_MY_TAG++;
 
     // First post receives
-
-  for (i = 0; i < num_recv_neighbors; i++) {
+    for (int i = 0; i < num_recv_neighbors; i++) {
         int partner = recv_list[i];
         MPI_Irecv(lengths + i,
            1,
@@ -439,22 +386,18 @@ void commPartition(Comm* c, Matrix* A)
            request + i);
    }
 
-  int* neighbors = new int[MAX_NUM_NEIGHBOURS];
-  int* recv_length = new int[MAX_NUM_NEIGHBOURS];
-  int* send_length = new int[MAX_NUM_NEIGHBOURS];
-
-  A->neighbors = neighbors;
-  A->recv_length = recv_length;
-  A->send_length = send_length;
+    int* neighbors = c->neighbors;
+    int* recv_length = c->recv_length;
+    int* send_length = c->send_length;
 
    j = 0;
-  for (i = 0; i < num_recv_neighbors; i++) {
+
+    for (int i = 0; i < num_recv_neighbors; i++) {
        int start = j;
        int newlength = 0;
 
        // go through list of external elements until updating
        // processor changes
-
        while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
            newlength++;
            j++;
@@ -469,25 +412,21 @@ void commPartition(Comm* c, Matrix* A)
    }
 
    // Complete the receives of the number of externals
-
-  for (i = 0; i <
num_recv_neighbors; i++) { + for (int i = 0; i < num_recv_neighbors; i++) { if (MPI_Wait(request + i, &status)) { - cerr << "MPI_Wait error\n" << endl; - exit(-1); + printf("MPI_Wait error\n"); + exit(EXIT_FAILURE); } send_length[i] = lengths[i]; } - delete[] lengths; - /////////////////////////////////////////////////////////////////// // Build "elements_to_send" list. These are the x elements I own // that need to be sent to other processors. - /////////////////////////////////////////////////////////////////// - MPI_MY_TAG++; j = 0; - for (i = 0; i < num_recv_neighbors; i++) { + + for (int i = 0; i < num_recv_neighbors; i++) { MPI_Irecv(elements_to_send + j, send_length[i], MPI_INT, @@ -499,14 +438,14 @@ void commPartition(Comm* c, Matrix* A) } j = 0; - for (i = 0; i < num_recv_neighbors; i++) { + + for (int i = 0; i < num_recv_neighbors; i++) { int start = j; int newlength = 0; // Go through list of external elements // until updating processor changes. This is redundant, but // saves us from recording this information. - while ((j < num_external) && (new_external_processor[j] == recv_list[i])) { newlength++; @@ -522,39 +461,101 @@ void commPartition(Comm* c, Matrix* A) } // receive from each neighbor the global index list of external elements - - for (i = 0; i < num_recv_neighbors; i++) { + for (int i = 0; i < num_recv_neighbors; i++) { if (MPI_Wait(request + i, &status)) { - cerr << "MPI_Wait error\n" << endl; - exit(-1); + printf("MPI_Wait error\n"); + exit(EXIT_FAILURE); } } - /// replace global indices by local indices /// - - for (i = 0; i < total_to_be_sent; i++) + /// replace global indices by local indices + for (int i = 0; i < total_to_be_sent; i++) { elements_to_send[i] -= start_row; + } - //////////////// // Finish up !! - //////////////// + c->num_send_neighbors = num_send_neighbors; + A->nc = A->nc + num_external; - A->num_send_neighbors = num_send_neighbors; - A->local_ncol = A->local_nrow + num_external; + // Used in exchange + CG_FLOAT* send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT, + total_to_be_sent * sizeof(CG_FLOAT)); + c->send_buffer = send_buffer; - // Used in exchange_externals - double* send_buffer = new double[total_to_be_sent]; - A->send_buffer = send_buffer; + free(recv_list); + free(new_external); +#endif +} - delete[] tmp_buffer; - delete[] global_index_offsets; - delete[] recv_list; - delete[] external_processor; - delete[] new_external; - delete[] new_external_processor; - delete[] request; +void commExchange(Comm* c, Matrix* A, double* x) +{ +#ifdef _MPI + int num_external = 0; - return; + // Extract Matrix pieces + + int local_nrow = A->nr; + int num_neighbors = c->num_send_neighbors; + int* recv_length = c->recv_length; + int* send_length = c->send_length; + int* neighbors = c->neighbors; + double* send_buffer = c->send_buffer; + int total_to_be_sent = c->total_to_be_sent; + int* elements_to_send = c->elements_to_send; + + int rank = c->rank; + int size = c->size; + MPI_Comm comm = c->comm; + + // first post receives, these are immediate receives + // Do not wait for result to come, will do that at the + // wait call below. 
+ int MPI_MY_TAG = 99; + + MPI_Request request[num_neighbors]; + + // Externals are at end of locals + double* x_external = (double*)x + local_nrow; + + // Post receives first + for (int i = 0; i < num_neighbors; i++) { + int n_recv = recv_length[i]; + MPI_Irecv(x_external, + n_recv, + MPI_DOUBLE, + neighbors[i], + MPI_MY_TAG, + MPI_COMM_WORLD, + request + i); + x_external += n_recv; + } + + // Fill up send buffer + for (int i = 0; i < total_to_be_sent; i++) { + send_buffer[i] = x[elements_to_send[i]]; + } + + // Send to each neighbor + for (int i = 0; i < num_neighbors; i++) { + int n_send = send_length[i]; + MPI_Send(send_buffer, + n_send, + MPI_DOUBLE, + neighbors[i], + MPI_MY_TAG, + MPI_COMM_WORLD); + send_buffer += n_send; + } + + // Complete the reads issued above + MPI_Status status; + for (int i = 0; i < num_neighbors; i++) { + if (MPI_Wait(request + i, &status)) { + printf("MPI_Wait error\n"); + exit(EXIT_FAILURE); + } + } +#endif } void commPrintConfig(Comm* c) @@ -568,20 +569,6 @@ void commPrintConfig(Comm* c) for (int i = 0; i < c->size; i++) { if (i == c->rank) { - printf("\tRank %d of %d\n", c->rank, c->size); - printf("\tNeighbours (bottom, top, left, right): %d %d, %d, %d\n", - c->neighbours[BOTTOM], - c->neighbours[TOP], - c->neighbours[LEFT], - c->neighbours[RIGHT]); - printf("\tIs boundary:\n"); - printf("\t\tLEFT: %d\n", commIsBoundary(c, LEFT)); - printf("\t\tRIGHT: %d\n", commIsBoundary(c, RIGHT)); - printf("\t\tBOTTOM: %d\n", commIsBoundary(c, BOTTOM)); - printf("\t\tTOP: %d\n", commIsBoundary(c, TOP)); - printf("\tCoordinates (i,j) %d %d\n", c->coords[IDIM], c->coords[JDIM]); - printf("\tDims (i,j) %d %d\n", c->dims[IDIM], c->dims[JDIM]); - printf("\tLocal domain size (i,j) %dx%d\n", c->imaxLocal, c->jmaxLocal); fflush(stdout); } MPI_Barrier(MPI_COMM_WORLD); diff --git a/src/comm.h b/src/comm.h index dfe296e..5e5c1af 100644 --- a/src/comm.h +++ b/src/comm.h @@ -10,6 +10,10 @@ #include "matrix.h" +#define MAX_EXTERNAL 100000 +#define MAX_NUM_MESSAGES 500 +#define MAX_NUM_NEIGHBOURS MAX_NUM_MESSAGES + enum op { MAX = 0, SUM }; typedef struct { @@ -24,9 +28,9 @@ typedef struct { int* external_local_index; int total_to_be_sent; int* elements_to_send; - int* neighbors; - int* recv_length; - int* send_length; + int neighbors[MAX_NUM_NEIGHBOURS]; + int recv_length[MAX_NUM_NEIGHBOURS]; + int send_length[MAX_NUM_NEIGHBOURS]; double* send_buffer; #endif } Comm; @@ -36,7 +40,7 @@ extern void commInit(Comm* c, int argc, char** argv); extern void commFinalize(Comm* c); extern void commPartition(Comm*, Matrix* m); extern void commPrintConfig(Comm*); -extern void commExchange(Comm*, double*); +extern void commExchange(Comm* c, Matrix* A, double* x); extern void commReduction(double* v, int op); static inline int commIsMaster(Comm* c) { return c->rank == 0; }
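
Usage note: after commPartition() has built the halo layout, commExchange() has to run
before every sparse matrix-vector product so that the external values stored behind the
local entries of x are current. The sketch below shows a possible call site; it is an
illustration only and not part of this patch. spmv() is a hypothetical placeholder for
the local SpMV kernel, and the only facts taken from the code above are the
commExchange(Comm*, Matrix*, double*) signature and that received halo values are
written to x[A->nr] and upward, so x must provide room for the local values plus all
externals appended behind them.

    #include "comm.h"
    #include "matrix.h"

    // assumed to exist elsewhere in the solver; operates on the remapped
    // local column indices produced by commPartition()
    extern void spmv(Matrix* A, double* x, double* y);

    static void matvecWithHalo(Comm* c, Matrix* A, double* x, double* y)
    {
        // indices 0 .. A->nr-1 of x hold the locally owned values; the
        // externals are appended directly behind them by the halo exchange
        commExchange(c, A, x); // fill the halo part of x from the neighbour ranks
        spmv(A, x, y);         // local sparse matrix-vector product
    }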