From 49dc2d97aeedb3ca19e59963e9b5eec27f61c95b Mon Sep 17 00:00:00 2001
From: Jan Eitzinger
Date: Mon, 20 Jan 2025 18:24:37 +0100
Subject: [PATCH] Fix bugs and clean up. Add subroutines.

---
 hpcg.par            |   6 +-
 mk/include_CLANG.mk |   2 +-
 src/comm.c          | 687 ++++++++++++++++++++++++--------------------
 src/comm.h          |  19 +-
 src/main.c          |   6 +-
 src/solver.c        |  16 +-
 src/util.h          |   5 +-
 7 files changed, 396 insertions(+), 345 deletions(-)

diff --git a/hpcg.par b/hpcg.par
index ce11b32..ed20133 100644
--- a/hpcg.par
+++ b/hpcg.par
@@ -3,9 +3,9 @@
 #==============================================================================
 
 filename generate
-nx 50
-ny 50
-nz 50
+nx 10
+ny 10
+nz 10
 
 itermax 10000
 eps 0.0001
diff --git a/mk/include_CLANG.mk b/mk/include_CLANG.mk
index b3b2415..d0641ab 100644
--- a/mk/include_CLANG.mk
+++ b/mk/include_CLANG.mk
@@ -17,5 +17,5 @@ VERSION = --version
 CFLAGS = -O3 -ffast-math -std=c99 $(OPENMP)
 #CFLAGS = -Ofast -fnt-store=aggressive -std=c99 $(OPENMP) #AMD CLANG
 LFLAGS = $(OPENMP)
-DEFINES += -D_GNU_SOURCE
+DEFINES += -D_GNU_SOURCE# -DVERBOSE
 INCLUDES = -I/Users/jan/.local/include
diff --git a/src/comm.c b/src/comm.c
index 7957178..a006daa 100644
--- a/src/comm.c
+++ b/src/comm.c
@@ -3,6 +3,7 @@
  * Use of this source code is governed by a MIT style
  * license that can be found in the LICENSE file. */
 #include "util.h"
+#include
 #include
 #include
 
@@ -19,33 +20,179 @@ static int sizeOfRank(int rank, int size, int N)
     return N / size + ((N % size > rank) ? 1 : 0);
 }
 
-void commReduction(double* v, int op)
+// Ensure that all the neighbors we expect to receive from also send to us
+// sendList - All ranks that receive values from us
+// numSendNeighbors - Number of entries in sendList
+// recvList - All ranks that send values to us
+// numRecvNeighbors - Number of entries in recvList
+// FIXME: What if ranks want to send to us and are not in sendList?
+static void probeNeighbors(
+    int* sendList, int numSendNeighbors, int* recvList, int numRecvNeighbors)
 {
-#ifdef _MPI
-    if (op == MAX) {
-        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-    } else if (op == SUM) {
-        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+    int val;
+
+    for (int i = 0; i < numSendNeighbors; i++) {
+        sendList[i] = -1;
+    }
+
+    int MPI_MY_TAG = 99;
+    MPI_Request request[MAX_NUM_MESSAGES];
+
+    for (int i = 0; i < numSendNeighbors; i++) {
+        MPI_Irecv(&val,
+            1,
+            MPI_INT,
+            MPI_ANY_SOURCE,
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+    }
+
+    for (int i = 0; i < numRecvNeighbors; i++) {
+        MPI_Send(&val, 1, MPI_INT, recvList[i], MPI_MY_TAG, MPI_COMM_WORLD);
+    }
+
+    // Receive message from each send neighbor to construct 'sendList'.
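+    // The payload of these messages is irrelevant: receiving any message from
+    // a rank proves that it needs values from us, and status.MPI_SOURCE below
+    // records which rank it was.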
+    MPI_Status status;
+    for (int i = 0; i < numSendNeighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            printf("MPI_Wait error\n");
+            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+            exit(EXIT_FAILURE);
+        }
+        sendList[i] = status.MPI_SOURCE;
+    }
+}
+
+static void buildNeighbors(Comm* c, int* external_processor)
+{
+    int numNeighbors = c->numNeighbors;
+    int numExternal = c->numExternal;
+    int* neighbors = c->neighbors;
+    int lengths[numNeighbors];
+    int MPI_MY_TAG = 100;
+    MPI_Request request[MAX_NUM_MESSAGES];
+
+    // First post receives
+    for (int i = 0; i < numNeighbors; i++) {
+        MPI_Irecv(lengths + i,
+            1,
+            MPI_INT,
+            neighbors[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+    }
+
+    int* recvCount = c->recvCount;
+    int* sendCount = c->sendCount;
+
+    int j = 0;
+
+    for (int i = 0; i < numNeighbors; i++) {
+        int count = 0;
+
+        // go through list of external elements until updating
+        // processor changes
+        while ((j < numExternal) && (external_processor[j] == neighbors[i])) {
+            count++;
+            j++;
+            if (j == numExternal) break;
+        }
+
+        recvCount[i] = count;
+        MPI_Send(&count, 1, MPI_INT, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
+    }
+
+    MPI_Status status;
+    // Complete the receives of the number of externals
+    for (int i = 0; i < numNeighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            printf("MPI_Wait error\n");
+            exit(EXIT_FAILURE);
+        }
+        sendCount[i] = lengths[i];
+    }
+}
+
+static void buildElementsToSend(
+    Comm* c, int startRow, int* external_processor, int* new_external)
+{
+    int numNeighbors = c->numNeighbors;
+    int numExternal = c->numExternal;
+    int* neighbors = c->neighbors;
+    int MPI_MY_TAG = 100;
+    MPI_Request request[MAX_NUM_MESSAGES];
+
+    c->elementsToSend = (int*)allocate(ARRAY_ALIGNMENT,
+        c->totalSendCount * sizeof(int));
+    int* elementsToSend = c->elementsToSend;
+
+    int j = 0;
+
+    for (int i = 0; i < numNeighbors; i++) {
+        MPI_Irecv(elementsToSend + j,
+            c->sendCount[i],
+            MPI_INT,
+            neighbors[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+
+        j += c->sendCount[i];
+    }
+
+    j = 0;
+
+    for (int i = 0; i < numNeighbors; i++) {
+        int start = j;
+
+        // Go through list of external elements
+        // until updating processor changes. This is redundant, but
+        // saves us from recording this information.
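+        // NOTE: this sweep assumes external_processor[] lists each neighbor's
+        // entries consecutively, grouped in the same order as neighbors[].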
+        while ((j < numExternal) && (external_processor[j] == neighbors[i])) {
+            j++;
+            if (j == numExternal) break;
+        }
+
+        MPI_Send(new_external + start,
+            j - start,
+            MPI_INT,
+            neighbors[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD);
+    }
+
+    MPI_Status status;
+    // receive from each neighbor the global index list of external elements
+    for (int i = 0; i < numNeighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            printf("MPI_Wait error\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    // replace global indices by local indices
+    for (int i = 0; i < c->totalSendCount; i++) {
+        elementsToSend[i] -= startRow;
     }
-#endif
 }
 
 void commPartition(Comm* c, Matrix* A)
 {
 #ifdef _MPI
-    int rank = c->rank;
-    int size = c->size;
-    MPI_Comm comm = c->comm;
+    int rank = c->rank;
+    int size = c->size;
 
     // Extract Matrix pieces
-    int start_row = A->startRow;
-    int stop_row = A->stopRow;
-    int total_nrow = A->totalNr;
-    long long total_nnz = A->totalNnz;
-    int local_nrow = A->nr;
-    int local_nnz = A->nnz;
-    int* row_ptr = (int*)A->rowPtr;
-    int* col_ind = (int*)A->colInd;
+    CG_UINT start_row = A->startRow;
+    CG_UINT stop_row = A->stopRow;
+    CG_UINT total_nrow = A->totalNr;
+    CG_UINT total_nnz = A->totalNnz;
+    CG_UINT local_nrow = A->nr;
+    CG_UINT local_nnz = A->nnz;
+    CG_UINT* row_ptr = A->rowPtr;
+    CG_UINT* col_ind = A->colInd;
 
     // We need to convert the index values for the rows on this processor
     // to a local index space. We need to:
@@ -58,48 +205,54 @@ void commPartition(Comm* c, Matrix* A)
     //   - find out which processor owns the value.
     // - Set up communication for sparse MV operation.
 
-    // Scan the indices and transform to local
-    int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
-    int num_external = 1;
+    // FIXME: Use a lookup table sized to the total number of rows. For lower
+    // memory consumption a map would be a better choice.
+    int* externals = (int*)allocate(ARRAY_ALIGNMENT, total_nrow * sizeof(int));
+    int num_external = 0; // local number of external indices
+
+    // column indices that are not processed yet are marked with -1
+    for (int i = 0; i < total_nrow; i++) {
+        externals[i] = -1;
+    }
 
     int* external_index = (int*)allocate(ARRAY_ALIGNMENT,
         MAX_EXTERNAL * sizeof(int));
-    c->external_index = external_index;
-
-    for (int i = 0; i < A->totalNr; i++) {
-        externals[i] = -1;
-    }
 
     for (int i = 0; i < local_nrow; i++) {
         for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) {
             int cur_ind = A->colInd[j];
 
 #ifdef VERBOSE
-            printf("Process %d of %d getting index %d in local row %d\n",
+            printf("Rank %d of %d getting entry %d:index %d in local row %d\n",
                 rank,
                 size,
+                j,
                 cur_ind,
                 i);
 #endif
-            // shift local rows to the start
+            // convert local column references to local numbering
             if (start_row <= cur_ind && cur_ind <= stop_row) {
                 col_ind[j] -= start_row;
             } else {
-                // Must find out if we have already set up this point
+                // find out if we have already set up this point
                 if (externals[cur_ind] == -1) {
-                    externals[cur_ind] = num_external++;
+                    externals[cur_ind] = num_external;
 
                     if (num_external <= MAX_EXTERNAL) {
-                        external_index[num_external - 1] = cur_ind;
-                        // Mark index as external by negating it
-                        col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
+                        external_index[num_external] = cur_ind;
+                        // mark the local column index as external by negating it
+                        // col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
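+                        // NOTE: plain negation cannot flag global column 0
+                        // (-0 == 0); the +1 offset kept above for reference
+                        // avoided that. Assumes column 0 is never external.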
+                        col_ind[j] = -col_ind[j];
                     } else {
                         printf("Must increase MAX_EXTERNAL\n");
+                        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
                         exit(EXIT_FAILURE);
                     }
+                    num_external++;
                 } else {
-                    // Mark index as external by negating it
+                    // Mark index as external by adding 1 and negating it
+                    // col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
                     col_ind[j] = -col_ind[j];
                 }
             }
@@ -109,44 +262,42 @@ void commPartition(Comm* c, Matrix* A)
     /**************************************************************************
     Go through list of externals to find out which processors must be accessed.
     **************************************************************************/
-    c->num_external = num_external;
-    int tmp_buffer[size];
-    int global_index_offsets[size];
-
-    for (int i = 0; i < size; i++) {
-        tmp_buffer[i] = 0;
-    }
-
-    tmp_buffer[rank] = start_row;
-
-    MPI_Allreduce(tmp_buffer,
-        global_index_offsets,
-        size,
-        MPI_INT,
-        MPI_SUM,
-        MPI_COMM_WORLD);
-
-    // Go through list of externals and find the processor that owns each
     int external_processor[num_external];
 
-    for (int i = 0; i < num_external; i++) {
-        int cur_ind = external_index[i];
-        for (int j = size - 1; j >= 0; j--) {
-            if (global_index_offsets[j] <= cur_ind) {
-                external_processor[i] = j;
-                break;
+    {
+        int globalIndexOffsets[size];
+
+        MPI_Allgather(&start_row,
+            1,
+            MPI_INT,
+            globalIndexOffsets,
+            1,
+            MPI_INT,
+            MPI_COMM_WORLD);
+
+        // for (int i = 0; i < size; i++) {
+        //     printf("Rank %d: i = %d: OFFSET %d\n", rank, i, globalIndexOffsets[i]);
+        // }
+
+        // Go through list of externals and find the processor that owns each
+        for (int i = 0; i < num_external; i++) {
+            int globalIndex = external_index[i];
+            for (int j = size - 1; j >= 0; j--) {
+                if (globalIndexOffsets[j] <= globalIndex) {
+                    external_processor[i] = j;
+                    break;
+                }
             }
         }
     }
 
     /*Go through the external elements. For each newly encountered external
     point assign it the next index in the local sequence. Then look for other
-    external elements who are updated by the same node and assign them the next
+    external elements who are updated by the same rank and assign them the next
     set of index numbers in the local sequence (ie. elements updated by the same
-    node have consecutive indices).*/
+    rank have consecutive indices).*/
     int* external_local_index = (int*)allocate(ARRAY_ALIGNMENT,
-        MAX_EXTERNAL * sizeof(int));
-    c->external_local_index = external_local_index;
+        num_external * sizeof(int));
 
     int count = local_nrow;
 
@@ -172,27 +323,39 @@ void commPartition(Comm* c, Matrix* A)
     for (int i = 0; i < local_nrow; i++) {
         for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) {
             if (col_ind[j] < 0) {
-                int cur_ind = -col_ind[j] - 1; // FIXME: Offset by 1??
+                // size_t cur_ind = -(col_ind[j] - 1); // FIXME: Offset by 1??
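+                // decode: col_ind[j] holds the negated global column index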
+                int cur_ind = -col_ind[j];
                 col_ind[j] = external_local_index[externals[cur_ind]];
             }
         }
     }
 
-    int new_external_processor[num_external];
-
-    for (int i = 0; i < num_external; i++) {
-        new_external_processor[i] = 0;
-    }
-
-    // setup map from external id to partition
-    for (int i = 0; i < num_external; i++) {
-        new_external_processor[external_local_index[i] - local_nrow] =
-            external_processor[i];
-    }
-
+    free(externals);
+    // int new_external_processor[num_external];
+    //
+    // for (int i = 0; i < num_external; i++) {
+    //     new_external_processor[i] = -1;
+    // }
+    //
+    // // setup map from external id to partition
+    // for (int i = 0; i < num_external; i++) {
+    //     int id = external_local_index[i] - local_nrow;
+    //     new_external_processor[id] = external_processor[i];
+    //     printf("Rank %d of %d: %d new_external_processor[%d] = %d\n",
+    //         rank,
+    //         size,
+    //         i,
+    //         id,
+    //         external_processor[i]);
+    // }
+    // commFinalize(c);
+    // exit(EXIT_SUCCESS);
+
 #ifdef VERBOSE
+    printf("Rank %d of %d: %d externals\n", rank, size, num_external);
+
     for (int i = 0; i < num_external; i++) {
-        printf("Process %d of %d: external process[%d] = %d\n",
+        printf("Rank %d of %d: external[%d] owned by %d\n",
            rank,
            size,
            i,
@@ -217,119 +380,93 @@ void commPartition(Comm* c, Matrix* A)
     int num_recv_neighbors = 0;
     int length = 1;
 
+    // Encode both the number of ranks that need values from this rank and the
+    // total number of values: add one for each new rank and add size for
+    // every additional value.
     for (int i = 0; i < num_external; i++) {
-        if (tmp_neighbors[new_external_processor[i]] == 0) {
+        if (tmp_neighbors[external_processor[i]] == 0) {
             num_recv_neighbors++;
-            tmp_neighbors[new_external_processor[i]] = 1;
+            tmp_neighbors[external_processor[i]] = 1;
         }
-        tmp_neighbors[new_external_processor[i]] += size;
+        tmp_neighbors[external_processor[i]] += size;
     }
 
     // sum over all processors all the tmp_neighbors arrays
-    MPI_Allreduce(tmp_neighbors,
-        tmp_buffer,
+    MPI_Allreduce(MPI_IN_PLACE,
+        tmp_neighbors,
         size,
         MPI_INT,
         MPI_SUM,
         MPI_COMM_WORLD);
 
-    /* decode the combined 'tmp_neighbors' (stored in tmp_buffer) array from all
-     * the processors */
-    int num_send_neighbors = tmp_buffer[rank] % size;
+    /* decode the combined 'tmp_neighbors' array from all ranks */
+    // Number of ranks that receive values from us
+    int num_send_neighbors = tmp_neighbors[rank] % size;
 
-    /* decode 'tmp_buffer[rank] to deduce total number of elements we must send */
-    int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;
+    /* decode 'tmp_neighbors[rank]' to deduce total number of elements we must
+     * send */
+    c->totalSendCount = (tmp_neighbors[rank] - num_send_neighbors) / size;
 
-    /* Check to see if we have enough workspace allocated. This could be
+    /* Check to see if we have enough memory allocated. This could be
     dynamically modified, but let's keep it simple for now...*/
     if (num_send_neighbors > MAX_NUM_MESSAGES) {
         printf("Must increase MAX_NUM_MESSAGES. Must be at least %d\n",
             num_send_neighbors);
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
         exit(EXIT_FAILURE);
     }
 
-    if (total_to_be_sent > MAX_EXTERNAL) {
+    if (c->totalSendCount > MAX_EXTERNAL) {
         printf("Must increase MAX_EXTERNAL. Must be at least %d\n",
-            total_to_be_sent);
+            c->totalSendCount);
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
         exit(EXIT_FAILURE);
     }
 
 #ifdef VERBOSE
-    cout << "Processor " << rank << " of " << size
-         << ": Number of send neighbors = " << num_send_neighbors << endl;
-
-    cout << "Processor " << rank << " of " << size
-         << ": Number of receive neighbors = " << num_recv_neighbors << endl;
-
-    cout << "Processor " << rank << " of " << size
-         << ": Total number of elements to send = " << total_to_be_sent << endl;
-
+    printf("Rank %d of %d: tmp_neighbors = %d\n",
+        rank,
+        size,
+        tmp_neighbors[rank]);
+    printf("Rank %d of %d: Number of send neighbors = %d\n",
+        rank,
+        size,
+        num_send_neighbors);
+    printf("Rank %d of %d: Number of receive neighbors = %d\n",
+        rank,
+        size,
+        num_recv_neighbors);
+    printf("Rank %d of %d: Total number of elements to send = %d\n",
+        rank,
+        size,
+        c->totalSendCount);
    MPI_Barrier(MPI_COMM_WORLD);
 #endif
 
     /* Make a list of the neighbors that will send information to update our
     external elements (in the order that we will receive this information).*/
-    int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
+    int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_NUM_MESSAGES * sizeof(int));
 
-    // FIXME: Create local scope
-    int j = 0;
-    recv_list[j++] = new_external_processor[0];
+    {
+        int j = 0;
+        recv_list[j++] = external_processor[0];
 
-    for (int i = 1; i < num_external; i++) {
-        if (new_external_processor[i - 1] != new_external_processor[i]) {
-            recv_list[j++] = new_external_processor[i];
+        for (int i = 1; i < num_external; i++) {
+            if (external_processor[i - 1] != external_processor[i]) {
+                recv_list[j++] = external_processor[i];
+            }
         }
     }
 
     // Ensure that all the neighbors we expect to receive from also send to us
-    // Send a 0 length message to each of our recv neighbors
     int send_list[num_send_neighbors];
-    for (int i = 0; i < num_send_neighbors; i++) {
-        send_list[i] = 0;
-    }
-
-    // first post receives, these are immediate receives
-    // Do not wait for result to come, will do that at the
-    // wait call below.
-    int MPI_MY_TAG = 99;
-
-    MPI_Request request[MAX_NUM_MESSAGES];
-
-    for (int i = 0; i < num_send_neighbors; i++) {
-        MPI_Irecv(tmp_buffer + i,
-            1,
-            MPI_INT,
-            MPI_ANY_SOURCE,
-            MPI_MY_TAG,
-            MPI_COMM_WORLD,
-            request + i);
-    }
-
-    // send messages
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        MPI_Send(tmp_buffer + i,
-            1,
-            MPI_INT,
-            recv_list[i],
-            MPI_MY_TAG,
-            MPI_COMM_WORLD);
-    }
-
-    // Receive message from each send neighbor to construct 'send_list'.
-    MPI_Status status;
-    for (int i = 0; i < num_send_neighbors; i++) {
-        if (MPI_Wait(request + i, &status)) {
-            printf("MPI_Wait error\n");
-            exit(EXIT_FAILURE);
-        }
-        send_list[i] = status.MPI_SOURCE;
-    }
+    probeNeighbors(send_list, num_send_neighbors, recv_list, num_recv_neighbors);
 
     /* Compare the two lists. In most cases they should be the same.
     // However, if they are not then add new entries to the recv list
     // that are in the send list (but not already in the recv list).
-    WHY!! This ensures that the sendlist is equal to the sendlist
+    FIXME: WHY!! This ensures that the recv_list is equal to the sendlist
     But why is this required?
    -> Just One neighbour list??*/
     for (int j = 0; j < num_send_neighbors; j++) {
         int found = 0;
@@ -343,29 +480,23 @@ void commPartition(Comm* c, Matrix* A)
                 rank,
                 size,
                 num_recv_neighbors,
-                send_list[i]);
+                send_list[j]);
 #endif
             recv_list[num_recv_neighbors] = send_list[j];
-            (num_recv_neighbors)++;
+            num_recv_neighbors++;
         }
     }
 
-    num_send_neighbors = num_recv_neighbors;
-
-    if (num_send_neighbors > MAX_NUM_MESSAGES) {
-        printf("Must increase MAX_EXTERNAL\n");
-        exit(EXIT_FAILURE);
+    // From here on only have one neighbor list for both send and recv
+    c->numNeighbors = num_recv_neighbors;
+
+    for (int i = 0; i < c->numNeighbors; i++) {
+        c->neighbors[i] = recv_list[i];
     }
 
-    // Start filling communication setup
-    // Create 'new_external' which explicitly put the external elements in the
-    // order given by 'external_local_index'
-    c->total_to_be_sent = total_to_be_sent;
-    int* elements_to_send = (int*)allocate(ARRAY_ALIGNMENT,
-        total_to_be_sent * sizeof(int));
-    c->elements_to_send = elements_to_send;
-
-    for (int i = 0; i < total_to_be_sent; i++) {
-        elements_to_send[i] = 0;
+    if (c->numNeighbors > MAX_NUM_MESSAGES) {
+        printf("Must increase MAX_EXTERNAL\n");
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+        exit(EXIT_FAILURE);
     }
 
     // Create 'new_external' which explicitly put the external elements in the
@@ -377,195 +508,100 @@ void commPartition(Comm* c, Matrix* A)
         new_external[external_local_index[i] - local_nrow] = external_index[i];
     }
 
+    free(external_local_index);
+    free(external_index);
+    c->numExternal = num_external;
+
+    buildNeighbors(c, external_processor);
+
     // Send each processor the global index list of the external elements in the
     // order that I will want to receive them when updating my external elements
-    int lengths[num_recv_neighbors];
-    MPI_MY_TAG++;
+    // Build "elementsToSend" list. These are the x elements the current rank
+    // owns that need to be sent to other ranks
+    buildElementsToSend(c, A->startRow, external_processor, new_external);
 
-    // First post receives
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        int partner = recv_list[i];
-        MPI_Irecv(lengths + i,
-            1,
-            MPI_INT,
-            partner,
-            MPI_MY_TAG,
-            MPI_COMM_WORLD,
-            request + i);
-    }
-
-    int* neighbors = c->neighbors;
-    int* recv_length = c->recv_length;
-    int* send_length = c->send_length;
-
-    j = 0;
-
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        int start = j;
-        int newlength = 0;
-
-        // go through list of external elements until updating
-        // processor changes
-        while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
-            newlength++;
-            j++;
-            if (j == num_external) break;
-        }
-
-        recv_length[i] = newlength;
-        neighbors[i] = recv_list[i];
-
-        length = j - start;
-        MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
-    }
-
-    // Complete the receives of the number of externals
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        if (MPI_Wait(request + i, &status)) {
-            printf("MPI_Wait error\n");
-            exit(EXIT_FAILURE);
-        }
-        send_length[i] = lengths[i];
-    }
-
-    // Build "elements_to_send" list. These are the x elements I own
-    // that need to be sent to other processors.
-    MPI_MY_TAG++;
-
-    j = 0;
-
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        MPI_Irecv(elements_to_send + j,
-            send_length[i],
-            MPI_INT,
-            neighbors[i],
-            MPI_MY_TAG,
-            MPI_COMM_WORLD,
-            request + i);
-        j += send_length[i];
-    }
-
-    j = 0;
-
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        int start = j;
-        int newlength = 0;
-
-        // Go through list of external elements
-        // until updating processor changes.
-        // This is redundant, but
-        // saves us from recording this information.
-        while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
-
-            newlength++;
-            j++;
-            if (j == num_external) break;
-        }
-        MPI_Send(new_external + start,
-            j - start,
-            MPI_INT,
-            recv_list[i],
-            MPI_MY_TAG,
-            MPI_COMM_WORLD);
-    }
-
-    // receive from each neighbor the global index list of external elements
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        if (MPI_Wait(request + i, &status)) {
-            printf("MPI_Wait error\n");
-            exit(EXIT_FAILURE);
-        }
-    }
-
-    /// replace global indices by local indices
-    for (int i = 0; i < total_to_be_sent; i++) {
-        elements_to_send[i] -= start_row;
-    }
-
-    // Finish up !!
-    c->num_send_neighbors = num_send_neighbors;
-    A->nc = A->nc + num_external;
-
-    // Used in exchange
-    CG_FLOAT* send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
-        total_to_be_sent * sizeof(CG_FLOAT));
-    c->send_buffer = send_buffer;
+    A->nc = A->nc + num_external;
+    c->send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
+        c->totalSendCount * sizeof(CG_FLOAT));
 
     free(recv_list);
     free(new_external);
 #endif
 }
 
-void commExchange(Comm* c, Matrix* A, double* x)
+void commExchange(Comm* c, Matrix* A, CG_FLOAT* x)
 {
 #ifdef _MPI
-    int num_external = 0;
+    int numNeighbors = c->numNeighbors;
+    int* neighbors = c->neighbors;
+    int* recvCount = c->recvCount;
+    int* sendCount = c->sendCount;
+    CG_FLOAT* sendBuffer = c->send_buffer;
+    int* elementsToSend = c->elementsToSend;
 
-    // Extract Matrix pieces
-
-    int local_nrow = A->nr;
-    int num_neighbors = c->num_send_neighbors;
-    int* recv_length = c->recv_length;
-    int* send_length = c->send_length;
-    int* neighbors = c->neighbors;
-    double* send_buffer = c->send_buffer;
-    int total_to_be_sent = c->total_to_be_sent;
-    int* elements_to_send = c->elements_to_send;
-
-    int rank = c->rank;
-    int size = c->size;
-    MPI_Comm comm = c->comm;
-
-    // first post receives, these are immediate receives
-    // Do not wait for result to come, will do that at the
-    // wait call below.
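+    // Halo exchange: post one receive per neighbor into the ghost region that
+    // starts behind the A->nr owned entries of x, gather owned values into the
+    // send buffer in elementsToSend order, then send one block per neighbor.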
     int MPI_MY_TAG = 99;
-
-    MPI_Request request[num_neighbors];
+    MPI_Request request[numNeighbors];
 
     // Externals are at end of locals
-    double* x_external = (double*)x + local_nrow;
+    CG_FLOAT* externals = x + A->nr;
 
-    // Post receives first
-    for (int i = 0; i < num_neighbors; i++) {
-        int n_recv = recv_length[i];
-        MPI_Irecv(x_external,
-            n_recv,
-            MPI_DOUBLE,
+    // Post receives
+    for (int i = 0; i < numNeighbors; i++) {
+        int count = recvCount[i];
+
+        MPI_Irecv(externals,
+            count,
+            MPI_FLOAT_TYPE,
             neighbors[i],
             MPI_MY_TAG,
             MPI_COMM_WORLD,
             request + i);
-        x_external += n_recv;
+
+        externals += count;
     }
 
-    // Fill up send buffer
-    for (int i = 0; i < total_to_be_sent; i++) {
-        send_buffer[i] = x[elements_to_send[i]];
+    // Copy values for all ranks into send buffer
+    for (int i = 0; i < c->totalSendCount; i++) {
+        sendBuffer[i] = x[elementsToSend[i]];
     }
 
     // Send to each neighbor
-    for (int i = 0; i < num_neighbors; i++) {
-        int n_send = send_length[i];
-        MPI_Send(send_buffer,
-            n_send,
-            MPI_DOUBLE,
+    for (int i = 0; i < numNeighbors; i++) {
+        int count = sendCount[i];
+
+        MPI_Send(sendBuffer,
+            count,
+            MPI_FLOAT_TYPE,
             neighbors[i],
             MPI_MY_TAG,
             MPI_COMM_WORLD);
-        send_buffer += n_send;
+
+        sendBuffer += count;
     }
 
-    // Complete the reads issued above
+    // Complete the receives issued above
     MPI_Status status;
-    for (int i = 0; i < num_neighbors; i++) {
+    for (int i = 0; i < numNeighbors; i++) {
         if (MPI_Wait(request + i, &status)) {
             printf("MPI_Wait error\n");
+            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
             exit(EXIT_FAILURE);
         }
     }
 #endif
 }
 
+void commReduction(double* v, int op)
+{
+#ifdef _MPI
+    if (op == MAX) {
+        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+    } else if (op == SUM) {
+        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    }
+#endif
+}
+
 void commPrintConfig(Comm* c)
 {
 #ifdef _MPI
@@ -577,6 +613,21 @@ void commPrintConfig(Comm* c)
 
     for (int i = 0; i < c->size; i++) {
         if (i == c->rank) {
+            printf("Rank %d has %d neighbors with %d externals:\n",
+                c->rank,
+                c->numNeighbors,
+                c->numExternal);
+            for (int j = 0; j < c->numNeighbors; j++) {
+                printf("\t%d: receive %d send %d\n",
+                    c->neighbors[j],
+                    c->recvCount[j],
+                    c->sendCount[j]);
+            }
+            printf("\tSend %d elements: [", c->totalSendCount);
+            for (int j = 0; j < c->totalSendCount; j++) {
+                printf("%d, ", c->elementsToSend[j]);
+            }
+            printf("]\n");
             fflush(stdout);
         }
         MPI_Barrier(MPI_COMM_WORLD);
@@ -594,14 +645,14 @@ void commMatrixDump(Comm* c, Matrix* m)
     CG_FLOAT* val = m->val;
 
     if (commIsMaster(c)) {
-        printf("Matrix: %lld total non zeroes, total number of rows %lld\n",
+        printf("Matrix: %d total non zeroes, total number of rows %d\n",
             m->totalNnz,
             m->totalNr);
     }
 
     for (int i = 0; i < size; i++) {
         if (i == rank) {
-            printf("Rank %d: %lld non zeroes, number of rows %lld\n",
+            printf("Rank %d: %d non zeroes, number of rows %d\n",
                 rank,
                 m->nnz,
                 numRows);
@@ -609,9 +660,9 @@ void commMatrixDump(Comm* c, Matrix* m)
             for (int rowID = 0; rowID < numRows; rowID++) {
                 printf("Row [%d]: ", rowID);
 
-                for (size_t rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
+                for (int rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
                      rowEntry++) {
-                    printf("[%lld]:%.2f ", colInd[rowEntry], val[rowEntry]);
+                    printf("[%d]:%.2f ", colInd[rowEntry], val[rowEntry]);
                 }
 
                 printf("\n");
diff --git a/src/comm.h b/src/comm.h
index 7821ff9..1bff459 100644
--- a/src/comm.h
+++ b/src/comm.h
@@ -4,6 +4,7 @@
  * license that can be found in the LICENSE file.
  */
 #ifndef __COMM_H_
 #define __COMM_H_
+#include "util.h"
 #if defined(_MPI)
 #include <mpi.h>
 #endif
@@ -20,18 +21,14 @@ typedef struct {
     int rank;
     int size;
 #if defined(_MPI)
-    MPI_Comm comm;
-
-    int num_external;
-    int num_send_neighbors;
-    int* external_index;
-    int* external_local_index;
-    int total_to_be_sent;
-    int* elements_to_send;
+    int numNeighbors;
+    int numExternal;
+    int totalSendCount;
+    int* elementsToSend;
     int neighbors[MAX_NUM_NEIGHBOURS];
-    int recv_length[MAX_NUM_NEIGHBOURS];
-    int send_length[MAX_NUM_NEIGHBOURS];
-    double* send_buffer;
+    int recvCount[MAX_NUM_NEIGHBOURS];
+    int sendCount[MAX_NUM_NEIGHBOURS];
+    CG_FLOAT* send_buffer;
 #endif
 } Comm;
diff --git a/src/main.c b/src/main.c
index 31afc64..d2c63eb 100644
--- a/src/main.c
+++ b/src/main.c
@@ -55,9 +55,9 @@ int main(int argc, char** argv)
     CG_FLOAT eps = (CG_FLOAT)param.eps;
     int itermax = param.itermax;
 
     initSolver(&s, &comm, &param);
-    commMatrixDump(&comm, &s.A);
-    commFinalize(&comm);
-    exit(EXIT_SUCCESS);
+    // commMatrixDump(&comm, &s.A);
+    commPartition(&comm, &s.A);
+    commPrintConfig(&comm);
 
     CG_UINT nrow = s.A.nr;
     CG_UINT ncol = s.A.nc;
diff --git a/src/solver.c b/src/solver.c
index c071190..81e1fed 100644
--- a/src/solver.c
+++ b/src/solver.c
@@ -109,7 +109,7 @@ static void matrixGenerate(
 #ifdef VERBOSE
     printf("Process %d of %d has %d rows\n", rank, size, local_nrow);
     printf("Global rows %d through %d\n", start_row, stop_row);
-    printf("%d nonzeros\n", start_row, stop_row);
+    printf("%d nonzeros\n", local_nnz);
 #endif /* ifdef VERBOSE */
 
     s->A.startRow = start_row;
@@ -139,11 +139,11 @@ void spMVM(Matrix* m, const CG_FLOAT* restrict x, CG_FLOAT* restrict y)
     CG_UINT* colInd = m->colInd;
     CG_FLOAT* val = m->val;
 
-    for (size_t rowID = 0; rowID < numRows; rowID++) {
+    for (int rowID = 0; rowID < numRows; rowID++) {
         CG_FLOAT tmp = y[rowID];
 
         // loop over all elements in row
-        for (size_t rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
+        for (int rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
             rowEntry++) {
             tmp += val[rowEntry] * x[colInd[rowEntry]];
         }
@@ -160,15 +160,15 @@ void waxpby(const CG_UINT n,
     CG_FLOAT* const w)
 {
     if (alpha == 1.0) {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             w[i] = x[i] + beta * y[i];
         }
     } else if (beta == 1.0) {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             w[i] = alpha * x[i] + y[i];
         }
     } else {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             w[i] = alpha * x[i] + beta * y[i];
         }
     }
@@ -182,11 +182,11 @@ void ddot(const CG_UINT n,
     CG_FLOAT sum = 0.0;
 
     if (y == x) {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             sum += x[i] * x[i];
         }
     } else {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             sum += x[i] * y[i];
         }
     }
diff --git a/src/util.h b/src/util.h
index 1bc160d..8bc38ca 100644
--- a/src/util.h
+++ b/src/util.h
@@ -23,13 +23,16 @@
 #define MAXLINE 4096
 #endif
 
-#define CG_UINT unsigned long long int
+// #define CG_UINT unsigned long long int
+#define CG_UINT int
 
 #if PRECISION == 1
 #define CG_FLOAT float
+#define MPI_FLOAT_TYPE MPI_FLOAT
 #define PRECISION_STRING "single"
 #else
 #define CG_FLOAT double
+#define MPI_FLOAT_TYPE MPI_DOUBLE
 #define PRECISION_STRING "double"
 #endif
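+// NOTE: MPI_FLOAT_TYPE must stay in sync with CG_FLOAT so that halo exchanges
+// in commExchange transfer values with the matching MPI datatype.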