Fix bugs and clean up. Add subroutines.
parent 6f1932cf4f
commit 49dc2d97ae
6 hpcg.par
@@ -3,9 +3,9 @@
 #==============================================================================

 filename generate
-nx 50
-ny 50
-nz 50
+nx 10
+ny 10
+nz 10

 itermax 10000
 eps 0.0001
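For orientation, a minimal sketch of what the new values imply, assuming (as the local_nrow handling in matrixGenerate suggests) that nx, ny and nz describe the per-rank grid and every grid point contributes one matrix row; the helper below is hypothetical and not part of the repository:

#include <stdio.h>

int main(void)
{
    int nx = 10, ny = 10, nz = 10; // new values from hpcg.par above
    int localRows = nx * ny * nz;  // one row per grid point (assumption)
    printf("rows per rank: %d\n", localRows); // prints 1000
    return 0;
}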
@@ -17,5 +17,5 @@ VERSION = --version
 CFLAGS = -O3 -ffast-math -std=c99 $(OPENMP)
 #CFLAGS = -Ofast -fnt-store=aggressive -std=c99 $(OPENMP) #AMD CLANG
 LFLAGS = $(OPENMP)
-DEFINES += -D_GNU_SOURCE
+DEFINES += -D_GNU_SOURCE# -DVERBOSE
 INCLUDES = -I/Users/jan/.local/include
667 src/comm.c
@@ -3,6 +3,7 @@
* Use of this source code is governed by a MIT style
* license that can be found in the LICENSE file. */
#include "util.h"
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
@@ -19,15 +20,162 @@ static int sizeOfRank(int rank, int size, int N)
    return N / size + ((N % size > rank) ? 1 : 0);
}

void commReduction(double* v, int op)
// Ensure that all the neighbors we expect to receive from also send to us
// sendList - All ranks that receive values from us
// numSendNeighbors - Number of entries in sendList
// recvList - All ranks that send values to us
// numRecvNeighbors - Number of entries in recvList
// FIXME: What if ranks want to send to us and are not in sendList?
static void probeNeighbors(
    int* sendList, int numSendNeighbors, int* recvList, int numRecvNeighbors)
{
#ifdef _MPI
    if (op == MAX) {
        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    } else if (op == SUM) {
        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    int val;

    for (int i = 0; i < numSendNeighbors; i++) {
        sendList[i] = -1;
    }

    int MPI_MY_TAG = 99;
    MPI_Request request[MAX_NUM_MESSAGES];

    for (int i = 0; i < numSendNeighbors; i++) {
        MPI_Irecv(&val,
            1,
            MPI_INT,
            MPI_ANY_SOURCE,
            MPI_MY_TAG,
            MPI_COMM_WORLD,
            request + i);
    }

    for (int i = 0; i < numRecvNeighbors; i++) {
        MPI_Send(&val, 1, MPI_INT, recvList[i], MPI_MY_TAG, MPI_COMM_WORLD);
    }

    // Receive message from each send neighbor to construct 'sendList'.
    MPI_Status status;
    for (int i = 0; i < numSendNeighbors; i++) {
        if (MPI_Wait(request + i, &status)) {
            printf("MPI_Wait error\n");
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
            exit(EXIT_FAILURE);
        }
        sendList[i] = status.MPI_SOURCE;
    }
}
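// Usage sketch (mirrors the probeNeighbors(...) call further down in
// commPartition; the numbers here are made up): every rank tells its receive
// neighbors "I will receive from you" with a one-int probe and collects the
// sources of the probes addressed to itself, which yields the send side.
//
//   int recvList[2] = { 1, 3 }; // ranks we receive externals from
//   int sendList[2];            // filled with ranks that expect data from us
//   probeNeighbors(sendList, 2, recvList, 2);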
|
||||
|
||||
static void buildNeighbors(Comm* c, int* external_processor)
|
||||
{
|
||||
int numNeighbors = c->numNeighbors;
|
||||
int numExternal = c->numExternal;
|
||||
int* neighbors = c->neighbors;
|
||||
int lengths[numNeighbors];
|
||||
int MPI_MY_TAG = 100;
|
||||
MPI_Request request[MAX_NUM_MESSAGES];
|
||||
|
||||
// First post receives
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
MPI_Irecv(lengths + i,
|
||||
1,
|
||||
MPI_INT,
|
||||
neighbors[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD,
|
||||
request + i);
|
||||
}
|
||||
|
||||
int* recvCount = c->recvCount;
|
||||
int* sendCount = c->sendCount;
|
||||
|
||||
int j = 0;
|
||||
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
int count = 0;
|
||||
|
||||
// go through list of external elements until updating
|
||||
// processor changes
|
||||
while ((j < numExternal) && (external_processor[j] == neighbors[i])) {
|
||||
count++;
|
||||
j++;
|
||||
if (j == numExternal) break;
|
||||
}
|
||||
|
||||
recvCount[i] = count;
|
||||
MPI_Send(&count, 1, MPI_INT, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
MPI_Status status;
|
||||
// Complete the receives of the number of externals
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
if (MPI_Wait(request + i, &status)) {
|
||||
printf("MPI_Wait error\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
sendCount[i] = lengths[i];
|
||||
}
|
||||
}
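// Worked example for buildNeighbors (hypothetical numbers): with
// neighbors = {1, 3} and the external list grouped by owning rank, 4 externals
// owned by rank 1 and 2 owned by rank 3 give recvCount = {4, 2}. Each count is
// sent to the owning rank, and the matching receives from those ranks fill
// sendCount, i.e. how many of our own rows each neighbor expects from us.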
|
||||
|
||||
static void buildElementsToSend(
|
||||
Comm* c, int startRow, int* external_processor, int* new_external)
|
||||
{
|
||||
int numNeighbors = c->numNeighbors;
|
||||
int numExternal = c->numExternal;
|
||||
int* neighbors = c->neighbors;
|
||||
int MPI_MY_TAG = 100;
|
||||
MPI_Request request[MAX_NUM_MESSAGES];
|
||||
c->elementsToSend = (int*)allocate(ARRAY_ALIGNMENT,
|
||||
c->totalSendCount * sizeof(int));
|
||||
int* elementsToSend = c->elementsToSend;
|
||||
|
||||
int j = 0;
|
||||
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
MPI_Irecv(elementsToSend + j,
|
||||
c->sendCount[i],
|
||||
MPI_INT,
|
||||
neighbors[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD,
|
||||
request + i);
|
||||
|
||||
j += c->sendCount[i];
|
||||
}
|
||||
|
||||
j = 0;
|
||||
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
int start = j;
|
||||
|
||||
// Go through list of external elements
|
||||
// until updating processor changes. This is redundant, but
|
||||
// saves us from recording this information.
|
||||
while ((j < numExternal) && (external_processor[j] == neighbors[i])) {
|
||||
j++;
|
||||
if (j == numExternal) break;
|
||||
}
|
||||
|
||||
MPI_Send(new_external + start,
|
||||
j - start,
|
||||
MPI_INT,
|
||||
neighbors[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
MPI_Status status;
|
||||
// receive from each neighbor the global index list of external elements
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
if (MPI_Wait(request + i, &status)) {
|
||||
printf("MPI_Wait error\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
/// replace global indices by local indices
|
||||
for (int i = 0; i < c->totalSendCount; i++) {
|
||||
elementsToSend[i] -= startRow;
|
||||
}
|
||||
#endif
|
||||
}
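// Taken together, these helpers factor the communication setup out of
// commPartition: probeNeighbors derives the send-side neighbor list from the
// receive side, buildNeighbors exchanges the per-neighbor element counts
// (recvCount/sendCount), and buildElementsToSend records, per neighbor, which
// local rows have to be packed into the send buffer during commExchange.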
|
||||
|
||||
void commPartition(Comm* c, Matrix* A)
|
||||
@ -35,17 +183,16 @@ void commPartition(Comm* c, Matrix* A)
|
||||
#ifdef _MPI
|
||||
int rank = c->rank;
|
||||
int size = c->size;
|
||||
MPI_Comm comm = c->comm;
|
||||
|
||||
// Extract Matrix pieces
|
||||
int start_row = A->startRow;
|
||||
int stop_row = A->stopRow;
|
||||
int total_nrow = A->totalNr;
|
||||
long long total_nnz = A->totalNnz;
|
||||
int local_nrow = A->nr;
|
||||
int local_nnz = A->nnz;
|
||||
int* row_ptr = (int*)A->rowPtr;
|
||||
int* col_ind = (int*)A->colInd;
|
||||
CG_UINT start_row = A->startRow;
|
||||
CG_UINT stop_row = A->stopRow;
|
||||
CG_UINT total_nrow = A->totalNr;
|
||||
CG_UINT total_nnz = A->totalNnz;
|
||||
CG_UINT local_nrow = A->nr;
|
||||
CG_UINT local_nnz = A->nnz;
|
||||
CG_UINT* row_ptr = A->rowPtr;
|
||||
CG_UINT* col_ind = A->colInd;
|
||||
|
||||
// We need to convert the index values for the rows on this processor
|
||||
// to a local index space. We need to:
|
||||
@ -58,48 +205,54 @@ void commPartition(Comm* c, Matrix* A)
|
||||
// - find out which processor owns the value.
|
||||
// - Set up communication for sparse MV operation.
|
||||
|
||||
// Scan the indices and transform to local
|
||||
int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
|
||||
int num_external = 1;
|
||||
// FIXME: Use a lookup table with size total number of rows. For lower memory
|
||||
// consumption a map would be better choice.
|
||||
int* externals = (int*)allocate(ARRAY_ALIGNMENT, total_nrow * sizeof(int));
|
||||
int num_external = 0; // local number of external indices
|
||||
|
||||
// column indices that are not processed yet are marked with -1
|
||||
for (int i = 0; i < total_nrow; i++) {
|
||||
externals[i] = -1;
|
||||
}
|
||||
|
||||
int* external_index = (int*)allocate(ARRAY_ALIGNMENT,
|
||||
MAX_EXTERNAL * sizeof(int));
|
||||
c->external_index = external_index;
|
||||
|
||||
for (int i = 0; i < A->totalNr; i++) {
|
||||
externals[i] = -1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < local_nrow; i++) {
|
||||
for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) {
|
||||
int cur_ind = A->colInd[j];
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("Process %d of %d getting index %d in local row %d\n",
|
||||
printf("Rank %d of %d getting entry %d:index %d in local row %d\n",
|
||||
rank,
|
||||
size,
|
||||
j,
|
||||
cur_ind,
|
||||
i);
|
||||
#endif
|
||||
|
||||
// shift local rows to the start
|
||||
// convert local column references to local numbering
|
||||
if (start_row <= cur_ind && cur_ind <= stop_row) {
|
||||
col_ind[j] -= start_row;
|
||||
} else {
|
||||
// Must find out if we have already set up this point
|
||||
// find out if we have already set up this point
|
||||
if (externals[cur_ind] == -1) {
|
||||
externals[cur_ind] = num_external++;
|
||||
externals[cur_ind] = num_external;
|
||||
|
||||
if (num_external <= MAX_EXTERNAL) {
|
||||
external_index[num_external - 1] = cur_ind;
|
||||
// Mark index as external by negating it
|
||||
col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
|
||||
external_index[num_external] = cur_ind;
|
||||
// mark in local column index as external by negating it
|
||||
// col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
|
||||
col_ind[j] = -col_ind[j];
|
||||
} else {
|
||||
printf("Must increase MAX_EXTERNAL\n");
|
||||
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
num_external++;
|
||||
} else {
|
||||
// Mark index as external by negating it
|
||||
// Mark index as external by adding 1 and negating it
|
||||
// col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
|
||||
col_ind[j] = -col_ind[j];
|
||||
}
|
||||
}
|
||||
@ -109,44 +262,42 @@ void commPartition(Comm* c, Matrix* A)
|
||||
/**************************************************************************
|
||||
Go through list of externals to find out which processors must be accessed.
|
||||
**************************************************************************/
|
||||
c->num_external = num_external;
|
||||
int tmp_buffer[size];
|
||||
int global_index_offsets[size];
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
tmp_buffer[i] = 0;
|
||||
}
|
||||
|
||||
tmp_buffer[rank] = start_row;
|
||||
|
||||
MPI_Allreduce(tmp_buffer,
|
||||
global_index_offsets,
|
||||
size,
|
||||
MPI_INT,
|
||||
MPI_SUM,
|
||||
MPI_COMM_WORLD);
|
||||
|
||||
// Go through list of externals and find the processor that owns each
|
||||
int external_processor[num_external];
|
||||
|
||||
{
|
||||
int globalIndexOffsets[size];
|
||||
|
||||
MPI_Allgather(&start_row,
|
||||
1,
|
||||
MPI_INT,
|
||||
globalIndexOffsets,
|
||||
1,
|
||||
MPI_INT,
|
||||
MPI_COMM_WORLD);
|
||||
|
||||
// for (int i = 0; i < size; i++) {
|
||||
// printf("Rank %d: i = %d: OFFSET %d\n", rank, i, globalIndexOffsets[i]);
|
||||
// }
|
||||
|
||||
// Go through list of externals and find the processor that owns each
|
||||
|
||||
for (int i = 0; i < num_external; i++) {
|
||||
int cur_ind = external_index[i];
|
||||
int globalIndex = external_index[i];
|
||||
for (int j = size - 1; j >= 0; j--) {
|
||||
if (global_index_offsets[j] <= cur_ind) {
|
||||
if (globalIndexOffsets[j] <= globalIndex) {
|
||||
external_processor[i] = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
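// Example of the owner lookup above (hypothetical offsets): with size = 4 and
// globalIndexOffsets = {0, 250, 500, 750}, a global index of 612 is owned by
// rank 2, the highest rank whose start row (500) is still <= 612. Scanning j
// from size - 1 downward finds exactly that rank.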
|
||||
/*Go through the external elements. For each newly encountered external
|
||||
point assign it the next index in the local sequence. Then look for other
|
||||
external elements who are updated by the same node and assign them the next
|
||||
external elements who are updated by the same rank and assign them the next
|
||||
set of index numbers in the local sequence (ie. elements updated by the same
|
||||
node have consecutive indices).*/
|
||||
rank have consecutive indices).*/
|
||||
int* external_local_index = (int*)allocate(ARRAY_ALIGNMENT,
|
||||
MAX_EXTERNAL * sizeof(int));
|
||||
c->external_local_index = external_local_index;
|
||||
num_external * sizeof(int));
|
||||
|
||||
int count = local_nrow;
|
||||
|
||||
@ -172,27 +323,39 @@ void commPartition(Comm* c, Matrix* A)
|
||||
for (int i = 0; i < local_nrow; i++) {
|
||||
for (int j = rowPtr[i]; j < rowPtr[i + 1]; j++) {
|
||||
if (col_ind[j] < 0) {
|
||||
int cur_ind = -col_ind[j] - 1; // FIXME: Offset by 1??
|
||||
// size_t cur_ind = -(col_ind[j] - 1); // FIXME: Offset by 1??
|
||||
int cur_ind = -col_ind[j];
|
||||
col_ind[j] = external_local_index[externals[cur_ind]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int new_external_processor[num_external];
|
||||
|
||||
for (int i = 0; i < num_external; i++) {
|
||||
new_external_processor[i] = 0;
|
||||
}
|
||||
|
||||
// setup map from external id to partition
|
||||
for (int i = 0; i < num_external; i++) {
|
||||
new_external_processor[external_local_index[i] - local_nrow] =
|
||||
external_processor[i];
|
||||
}
|
||||
|
||||
free(externals);
|
||||
// int new_external_processor[num_external];
|
||||
//
|
||||
// for (int i = 0; i < num_external; i++) {
|
||||
// new_external_processor[i] = -1;
|
||||
// }
|
||||
//
|
||||
// // setup map from external id to partition
|
||||
// for (int i = 0; i < num_external; i++) {
|
||||
// int id = external_local_index[i] - local_nrow;
|
||||
// new_external_processor[id] = external_processor[i];
|
||||
// printf("Rank %d of %d: %d new_external_processor[%d] = %d\n",
|
||||
// rank,
|
||||
// size,
|
||||
// i,
|
||||
// id,
|
||||
// external_processor[i]);
|
||||
// }
|
||||
// commFinalize(c);
|
||||
// exit(EXIT_SUCCESS);
|
||||
//
|
||||
#ifdef VERBOSE
|
||||
printf("Rank %d of %d: %d externals\n", rank, size, num_external);
|
||||
|
||||
for (int i = 0; i < num_external; i++) {
|
||||
printf("Process %d of %d: external process[%d] = %d\n",
|
||||
printf("Rank %d of %d: external[%d] owned by %d\n",
|
||||
rank,
|
||||
size,
|
||||
i,
|
||||
@ -217,119 +380,93 @@ void commPartition(Comm* c, Matrix* A)
|
||||
int num_recv_neighbors = 0;
|
||||
int length = 1;
|
||||
|
||||
// Encoding both number of ranks that need values from this rank and the total
|
||||
// number of values by adding one for any additional rank and adding size for
|
||||
// every additional value.
|
||||
for (int i = 0; i < num_external; i++) {
|
||||
if (tmp_neighbors[new_external_processor[i]] == 0) {
|
||||
if (tmp_neighbors[external_processor[i]] == 0) {
|
||||
num_recv_neighbors++;
|
||||
tmp_neighbors[new_external_processor[i]] = 1;
|
||||
tmp_neighbors[external_processor[i]] = 1;
|
||||
}
|
||||
tmp_neighbors[new_external_processor[i]] += size;
|
||||
tmp_neighbors[external_processor[i]] += size;
|
||||
}
|
||||
|
||||
// sum over all processors all the tmp_neighbors arrays
|
||||
MPI_Allreduce(tmp_neighbors,
|
||||
tmp_buffer,
|
||||
MPI_Allreduce(MPI_IN_PLACE,
|
||||
tmp_neighbors,
|
||||
size,
|
||||
MPI_INT,
|
||||
MPI_SUM,
|
||||
MPI_COMM_WORLD);
|
||||
|
||||
/* decode the combined 'tmp_neighbors' (stored in tmp_buffer) array from all
|
||||
* the processors */
|
||||
int num_send_neighbors = tmp_buffer[rank] % size;
|
||||
/* decode the combined 'tmp_neighbors' array from all ranks */
|
||||
// Number of ranks that receive values from us
|
||||
int num_send_neighbors = tmp_neighbors[rank] % size;
|
||||
|
||||
/* decode 'tmp_buffer[rank] to deduce total number of elements we must send */
|
||||
int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;
|
||||
/* decode 'tmp_neighbors[rank] to deduce total number of elements we must send
|
||||
*/
|
||||
c->totalSendCount = (tmp_neighbors[rank] - num_send_neighbors) / size;
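// Worked example of the encoding (hypothetical numbers): with size = 4, if
// rank 1 needs 3 of our rows it adds 1 + 3*4 = 13 to our slot, and if rank 3
// needs 5 rows it adds 1 + 5*4 = 21. After the summing Allreduce our slot
// holds 34, so num_send_neighbors = 34 % 4 = 2 and
// c->totalSendCount = (34 - 2) / 4 = 8.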
|
||||
|
||||
/* Check to see if we have enough workspace allocated. This could be
|
||||
/* Check to see if we have enough memory allocated. This could be
|
||||
dynamically modified, but let's keep it simple for now...*/
|
||||
if (num_send_neighbors > MAX_NUM_MESSAGES) {
|
||||
printf("Must increase MAX_NUM_MESSAGES. Must be at least %d\n",
|
||||
num_send_neighbors);
|
||||
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if (total_to_be_sent > MAX_EXTERNAL) {
|
||||
if (c->totalSendCount > MAX_EXTERNAL) {
|
||||
printf("Must increase MAX_EXTERNAL. Must be at least %d\n",
|
||||
total_to_be_sent);
|
||||
c->totalSendCount);
|
||||
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
#ifdef VERBOSE
|
||||
cout << "Processor " << rank << " of " << size
|
||||
<< ": Number of send neighbors = " << num_send_neighbors << endl;
|
||||
|
||||
cout << "Processor " << rank << " of " << size
|
||||
<< ": Number of receive neighbors = " << num_recv_neighbors << endl;
|
||||
|
||||
cout << "Processor " << rank << " of " << size
|
||||
<< ": Total number of elements to send = " << total_to_be_sent << endl;
|
||||
|
||||
printf("Rank %d of %d: tmp_neighbors = %d\n",
|
||||
rank,
|
||||
size,
|
||||
tmp_neighbors[rank]);
|
||||
printf("Rank %d of %d: Number of send neighbors = %d\n",
|
||||
rank,
|
||||
size,
|
||||
num_send_neighbors);
|
||||
printf("Rank %d of %d: Number of receive neighbors = %d\n",
|
||||
rank,
|
||||
size,
|
||||
num_recv_neighbors);
|
||||
printf("Rank %d of %d: Total number of elements to send = %d\n",
|
||||
rank,
|
||||
size,
|
||||
c->totalSendCount);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
#endif
|
||||
|
||||
/* Make a list of the neighbors that will send information to update our
|
||||
external elements (in the order that we will receive this information).*/
|
||||
int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
|
||||
int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_NUM_MESSAGES * sizeof(int));
|
||||
|
||||
// FIXME: Create local scope
|
||||
{
|
||||
int j = 0;
|
||||
recv_list[j++] = new_external_processor[0];
|
||||
recv_list[j++] = external_processor[0];
|
||||
|
||||
for (int i = 1; i < num_external; i++) {
|
||||
if (new_external_processor[i - 1] != new_external_processor[i]) {
|
||||
recv_list[j++] = new_external_processor[i];
|
||||
if (external_processor[i - 1] != external_processor[i]) {
|
||||
recv_list[j++] = external_processor[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that all the neighbors we expect to receive from also send to us
|
||||
// Send a 0 length message to each of our recv neighbors
|
||||
int send_list[num_send_neighbors];
|
||||
|
||||
for (int i = 0; i < num_send_neighbors; i++) {
|
||||
send_list[i] = 0;
|
||||
}
|
||||
|
||||
// first post receives, these are immediate receives
|
||||
// Do not wait for result to come, will do that at the
|
||||
// wait call below.
|
||||
int MPI_MY_TAG = 99;
|
||||
|
||||
MPI_Request request[MAX_NUM_MESSAGES];
|
||||
|
||||
for (int i = 0; i < num_send_neighbors; i++) {
|
||||
MPI_Irecv(tmp_buffer + i,
|
||||
1,
|
||||
MPI_INT,
|
||||
MPI_ANY_SOURCE,
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD,
|
||||
request + i);
|
||||
}
|
||||
|
||||
// send messages
|
||||
for (int i = 0; i < num_recv_neighbors; i++) {
|
||||
MPI_Send(tmp_buffer + i,
|
||||
1,
|
||||
MPI_INT,
|
||||
recv_list[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
// Receive message from each send neighbor to construct 'send_list'.
|
||||
MPI_Status status;
|
||||
for (int i = 0; i < num_send_neighbors; i++) {
|
||||
if (MPI_Wait(request + i, &status)) {
|
||||
printf("MPI_Wait error\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
send_list[i] = status.MPI_SOURCE;
|
||||
}
|
||||
probeNeighbors(send_list, num_send_neighbors, recv_list, num_recv_neighbors);
|
||||
|
||||
/* Compare the two lists. In most cases they should be the same.
|
||||
// However, if they are not then add new entries to the recv list
|
||||
// that are in the send list (but not already in the recv list).
|
||||
WHY!! This ensures that the sendlist is equal to the sendlist
|
||||
FIXME: WHY!! This ensures that the recv_list is equal to the sendlist
|
||||
But why is this required? -> Just One neighbour list??*/
|
||||
for (int j = 0; j < num_send_neighbors; j++) {
|
||||
int found = 0;
|
||||
@ -343,31 +480,25 @@ void commPartition(Comm* c, Matrix* A)
|
||||
rank,
|
||||
size,
|
||||
num_recv_neighbors,
|
||||
send_list[i]);
|
||||
send_list[j]);
|
||||
#endif
|
||||
recv_list[num_recv_neighbors] = send_list[j];
|
||||
(num_recv_neighbors)++;
|
||||
num_recv_neighbors++;
|
||||
}
|
||||
}
|
||||
num_send_neighbors = num_recv_neighbors;
|
||||
|
||||
if (num_send_neighbors > MAX_NUM_MESSAGES) {
|
||||
// From here on only have one neighbor list for both send and recv
|
||||
c->numNeighbors = num_recv_neighbors;
|
||||
for (int i = 0; i < c->numNeighbors; i++) {
|
||||
c->neighbors[i] = recv_list[i];
|
||||
}
|
||||
|
||||
if (c->numNeighbors > MAX_NUM_MESSAGES) {
|
||||
printf("Must increase MAX_EXTERNAL\n");
|
||||
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Start filling communication setup
|
||||
// Create 'new_external' which explicitly put the external elements in the
|
||||
// order given by 'external_local_index'
|
||||
c->total_to_be_sent = total_to_be_sent;
|
||||
int* elements_to_send = (int*)allocate(ARRAY_ALIGNMENT,
|
||||
total_to_be_sent * sizeof(int));
|
||||
c->elements_to_send = elements_to_send;
|
||||
|
||||
for (int i = 0; i < total_to_be_sent; i++) {
|
||||
elements_to_send[i] = 0;
|
||||
}
|
||||
|
||||
// Create 'new_external' which explicitly put the external elements in the
|
||||
// order given by 'external_local_index'
|
||||
int* new_external = (int*)allocate(ARRAY_ALIGNMENT,
|
||||
@ -377,195 +508,100 @@ void commPartition(Comm* c, Matrix* A)
|
||||
new_external[external_local_index[i] - local_nrow] = external_index[i];
|
||||
}
|
||||
|
||||
free(external_local_index);
|
||||
free(external_index);
|
||||
c->numExternal = num_external;
|
||||
|
||||
buildNeighbors(c, external_processor);
|
||||
|
||||
// Send each processor the global index list of the external elements in the
|
||||
// order that I will want to receive them when updating my external elements
|
||||
int lengths[num_recv_neighbors];
|
||||
MPI_MY_TAG++;
|
||||
// Build "elementsToSend" list. These are the x elements the current rank
|
||||
// owns that need to be sent to other ranks
|
||||
buildElementsToSend(c, A->startRow, external_processor, new_external);
|
||||
|
||||
// First post receives
|
||||
for (int i = 0; i < num_recv_neighbors; i++) {
|
||||
int partner = recv_list[i];
|
||||
MPI_Irecv(lengths + i,
|
||||
1,
|
||||
MPI_INT,
|
||||
partner,
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD,
|
||||
request + i);
|
||||
}
|
||||
|
||||
int* neighbors = c->neighbors;
|
||||
int* recv_length = c->recv_length;
|
||||
int* send_length = c->send_length;
|
||||
|
||||
j = 0;
|
||||
|
||||
for (int i = 0; i < num_recv_neighbors; i++) {
|
||||
int start = j;
|
||||
int newlength = 0;
|
||||
|
||||
// go through list of external elements until updating
|
||||
// processor changes
|
||||
while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
|
||||
newlength++;
|
||||
j++;
|
||||
if (j == num_external) break;
|
||||
}
|
||||
|
||||
recv_length[i] = newlength;
|
||||
neighbors[i] = recv_list[i];
|
||||
|
||||
length = j - start;
|
||||
MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
// Complete the receives of the number of externals
|
||||
for (int i = 0; i < num_recv_neighbors; i++) {
|
||||
if (MPI_Wait(request + i, &status)) {
|
||||
printf("MPI_Wait error\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
send_length[i] = lengths[i];
|
||||
}
|
||||
|
||||
// Build "elements_to_send" list. These are the x elements I own
|
||||
// that need to be sent to other processors.
|
||||
MPI_MY_TAG++;
|
||||
|
||||
j = 0;
|
||||
|
||||
for (int i = 0; i < num_recv_neighbors; i++) {
|
||||
MPI_Irecv(elements_to_send + j,
|
||||
send_length[i],
|
||||
MPI_INT,
|
||||
neighbors[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD,
|
||||
request + i);
|
||||
j += send_length[i];
|
||||
}
|
||||
|
||||
j = 0;
|
||||
|
||||
for (int i = 0; i < num_recv_neighbors; i++) {
|
||||
int start = j;
|
||||
int newlength = 0;
|
||||
|
||||
// Go through list of external elements
|
||||
// until updating processor changes. This is redundant, but
|
||||
// saves us from recording this information.
|
||||
while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
|
||||
|
||||
newlength++;
|
||||
j++;
|
||||
if (j == num_external) break;
|
||||
}
|
||||
MPI_Send(new_external + start,
|
||||
j - start,
|
||||
MPI_INT,
|
||||
recv_list[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
// receive from each neighbor the global index list of external elements
|
||||
for (int i = 0; i < num_recv_neighbors; i++) {
|
||||
if (MPI_Wait(request + i, &status)) {
|
||||
printf("MPI_Wait error\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
/// replace global indices by local indices
|
||||
for (int i = 0; i < total_to_be_sent; i++) {
|
||||
elements_to_send[i] -= start_row;
|
||||
}
|
||||
|
||||
// Finish up !!
|
||||
c->num_send_neighbors = num_send_neighbors;
|
||||
A->nc = A->nc + num_external;
|
||||
|
||||
// Used in exchange
|
||||
CG_FLOAT* send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
|
||||
total_to_be_sent * sizeof(CG_FLOAT));
|
||||
c->send_buffer = send_buffer;
|
||||
c->send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
|
||||
c->totalSendCount * sizeof(CG_FLOAT));
|
||||
|
||||
free(recv_list);
|
||||
free(new_external);
|
||||
#endif
|
||||
}
|
||||
|
||||
void commExchange(Comm* c, Matrix* A, double* x)
|
||||
void commExchange(Comm* c, Matrix* A, CG_FLOAT* x)
|
||||
{
|
||||
#ifdef _MPI
|
||||
int num_external = 0;
|
||||
|
||||
// Extract Matrix pieces
|
||||
|
||||
int local_nrow = A->nr;
|
||||
int num_neighbors = c->num_send_neighbors;
|
||||
int* recv_length = c->recv_length;
|
||||
int* send_length = c->send_length;
|
||||
int numNeighbors = c->numNeighbors;
|
||||
int* neighbors = c->neighbors;
|
||||
double* send_buffer = c->send_buffer;
|
||||
int total_to_be_sent = c->total_to_be_sent;
|
||||
int* elements_to_send = c->elements_to_send;
|
||||
int* recvCount = c->recvCount;
|
||||
int* sendCount = c->sendCount;
|
||||
CG_FLOAT* sendBuffer = c->send_buffer;
|
||||
int* elementsToSend = c->elementsToSend;
|
||||
|
||||
int rank = c->rank;
|
||||
int size = c->size;
|
||||
MPI_Comm comm = c->comm;
|
||||
|
||||
// first post receives, these are immediate receives
|
||||
// Do not wait for result to come, will do that at the
|
||||
// wait call below.
|
||||
int MPI_MY_TAG = 99;
|
||||
|
||||
MPI_Request request[num_neighbors];
|
||||
MPI_Request request[numNeighbors];
|
||||
|
||||
// Externals are at end of locals
|
||||
double* x_external = (double*)x + local_nrow;
|
||||
CG_FLOAT* externals = x + A->nr;
|
||||
|
||||
// Post receives first
|
||||
for (int i = 0; i < num_neighbors; i++) {
|
||||
int n_recv = recv_length[i];
|
||||
MPI_Irecv(x_external,
|
||||
n_recv,
|
||||
MPI_DOUBLE,
|
||||
// Post receives
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
int count = recvCount[i];
|
||||
|
||||
MPI_Irecv(externals,
|
||||
count,
|
||||
MPI_FLOAT_TYPE,
|
||||
neighbors[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD,
|
||||
request + i);
|
||||
x_external += n_recv;
|
||||
|
||||
externals += count;
|
||||
}
|
||||
|
||||
// Fill up send buffer
|
||||
for (int i = 0; i < total_to_be_sent; i++) {
|
||||
send_buffer[i] = x[elements_to_send[i]];
|
||||
// Copy values for all ranks into send buffer
|
||||
for (int i = 0; i < c->totalSendCount; i++) {
|
||||
sendBuffer[i] = x[elementsToSend[i]];
|
||||
}
|
||||
|
||||
// Send to each neighbor
|
||||
for (int i = 0; i < num_neighbors; i++) {
|
||||
int n_send = send_length[i];
|
||||
MPI_Send(send_buffer,
|
||||
n_send,
|
||||
MPI_DOUBLE,
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
int count = sendCount[i];
|
||||
|
||||
MPI_Send(sendBuffer,
|
||||
count,
|
||||
MPI_FLOAT_TYPE,
|
||||
neighbors[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD);
|
||||
send_buffer += n_send;
|
||||
|
||||
sendBuffer += count;
|
||||
}
|
||||
|
||||
// Complete the reads issued above
|
||||
// Complete the receives issued above
|
||||
MPI_Status status;
|
||||
for (int i = 0; i < num_neighbors; i++) {
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
if (MPI_Wait(request + i, &status)) {
|
||||
printf("MPI_Wait error\n");
|
||||
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
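// Typical use (sketch, not a call that appears verbatim in this diff): the
// halo exchange has to complete before each sparse matrix-vector product so
// that the external entries appended behind the A->nr local entries of x are
// up to date. p and Ap are hypothetical vector names:
//
//   commExchange(&comm, &s.A, p);
//   spMVM(&s.A, p, Ap);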
|
||||
|
||||
void commReduction(double* v, int op)
|
||||
{
|
||||
#ifdef _MPI
|
||||
if (op == MAX) {
|
||||
MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
|
||||
} else if (op == SUM) {
|
||||
MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
}
|
||||
#endif
|
||||
}
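// Usage sketch (hypothetical call site): commReduction turns a rank-local
// partial result into a global one in place, and compiles to a no-op without
// _MPI, e.g. for a dot product:
//
//   double globalSum = localSum;    // localSum: hypothetical rank-local value
//   commReduction(&globalSum, SUM); // SUM/MAX as used by the branches above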
|
||||
|
||||
void commPrintConfig(Comm* c)
|
||||
{
|
||||
#ifdef _MPI
|
||||
@ -577,6 +613,21 @@ void commPrintConfig(Comm* c)
|
||||
|
||||
for (int i = 0; i < c->size; i++) {
|
||||
if (i == c->rank) {
|
||||
printf("Rank %d has %d neighbors with %d externals:\n",
|
||||
c->rank,
|
||||
c->numNeighbors,
|
||||
c->numExternal);
|
||||
for (int j = 0; j < c->numNeighbors; j++) {
|
||||
printf("\t%d: receive %d send %d\n",
|
||||
c->neighbors[j],
|
||||
c->recvCount[j],
|
||||
c->sendCount[j]);
|
||||
}
|
||||
printf("\tSend %d elements: [", c->totalSendCount);
|
||||
for (int j = 0; j < c->totalSendCount; j++) {
|
||||
printf("%d, ", c->elementsToSend[j]);
|
||||
}
|
||||
printf("]\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
@ -594,14 +645,14 @@ void commMatrixDump(Comm* c, Matrix* m)
|
||||
CG_FLOAT* val = m->val;
|
||||
|
||||
if (commIsMaster(c)) {
|
||||
printf("Matrix: %lld total non zeroes, total number of rows %lld\n",
|
||||
printf("Matrix: %d total non zeroes, total number of rows %d\n",
|
||||
m->totalNnz,
|
||||
m->totalNr);
|
||||
}
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
if (i == rank) {
|
||||
printf("Rank %d: %lld non zeroes, number of rows %lld\n",
|
||||
printf("Rank %d: %d non zeroes, number of rows %d\n",
|
||||
rank,
|
||||
m->nnz,
|
||||
numRows);
|
||||
@ -609,9 +660,9 @@ void commMatrixDump(Comm* c, Matrix* m)
|
||||
for (int rowID = 0; rowID < numRows; rowID++) {
|
||||
printf("Row [%d]: ", rowID);
|
||||
|
||||
for (size_t rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
|
||||
for (int rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
|
||||
rowEntry++) {
|
||||
printf("[%lld]:%.2f ", colInd[rowEntry], val[rowEntry]);
|
||||
printf("[%d]:%.2f ", colInd[rowEntry], val[rowEntry]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
19 src/comm.h
@@ -4,6 +4,7 @@
* license that can be found in the LICENSE file. */
#ifndef __COMM_H_
#define __COMM_H_
#include "util.h"
#if defined(_MPI)
#include <mpi.h>
#endif
@@ -20,18 +21,14 @@ typedef struct {
    int rank;
    int size;
#if defined(_MPI)
    MPI_Comm comm;

    int num_external;
    int num_send_neighbors;
    int* external_index;
    int* external_local_index;
    int total_to_be_sent;
    int* elements_to_send;
    int numNeighbors;
    int numExternal;
    int totalSendCount;
    int* elementsToSend;
    int neighbors[MAX_NUM_NEIGHBOURS];
    int recv_length[MAX_NUM_NEIGHBOURS];
    int send_length[MAX_NUM_NEIGHBOURS];
    double* send_buffer;
    int recvCount[MAX_NUM_NEIGHBOURS];
    int sendCount[MAX_NUM_NEIGHBOURS];
    CG_FLOAT* send_buffer;
#endif
} Comm;
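/* How the new fields relate (descriptive note): for each neighbor i
 * (0 <= i < numNeighbors), neighbors[i] is the partner rank, recvCount[i]
 * values are received from it into the external part of x, and sendCount[i]
 * of our own values are packed from elementsToSend into send_buffer for it;
 * totalSendCount is the sum of all sendCount entries. */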
@@ -55,9 +55,9 @@ int main(int argc, char** argv)
    CG_FLOAT eps = (CG_FLOAT)param.eps;
    int itermax = param.itermax;
    initSolver(&s, &comm, &param);
    commMatrixDump(&comm, &s.A);
    commFinalize(&comm);
    exit(EXIT_SUCCESS);
    // commMatrixDump(&comm, &s.A);
    commPartition(&comm, &s.A);
    commPrintConfig(&comm);

    CG_UINT nrow = s.A.nr;
    CG_UINT ncol = s.A.nc;
16 src/solver.c
@@ -109,7 +109,7 @@ static void matrixGenerate(
#ifdef VERBOSE
    printf("Process %d of %d has %d rows\n", rank, size, local_nrow);
    printf("Global rows %d through %d\n", start_row, stop_row);
    printf("%d nonzeros\n", start_row, stop_row);
    printf("%d nonzeros\n", local_nnz);
#endif /* ifdef VERBOSE */

    s->A.startRow = start_row;
@@ -139,11 +139,11 @@ void spMVM(Matrix* m, const CG_FLOAT* restrict x, CG_FLOAT* restrict y)
    CG_UINT* colInd = m->colInd;
    CG_FLOAT* val = m->val;

    for (size_t rowID = 0; rowID < numRows; rowID++) {
    for (int rowID = 0; rowID < numRows; rowID++) {
        CG_FLOAT tmp = y[rowID];

        // loop over all elements in row
        for (size_t rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
        for (int rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
            rowEntry++) {
            tmp += val[rowEntry] * x[colInd[rowEntry]];
        }
@@ -160,15 +160,15 @@ void waxpby(const CG_UINT n,
    CG_FLOAT* const w)
{
    if (alpha == 1.0) {
        for (size_t i = 0; i < n; i++) {
        for (int i = 0; i < n; i++) {
            w[i] = x[i] + beta * y[i];
        }
    } else if (beta == 1.0) {
        for (size_t i = 0; i < n; i++) {
        for (int i = 0; i < n; i++) {
            w[i] = alpha * x[i] + y[i];
        }
    } else {
        for (size_t i = 0; i < n; i++) {
        for (int i = 0; i < n; i++) {
            w[i] = alpha * x[i] + beta * y[i];
        }
    }
@@ -182,11 +182,11 @@ void ddot(const CG_UINT n,
    CG_FLOAT sum = 0.0;

    if (y == x) {
        for (size_t i = 0; i < n; i++) {
        for (int i = 0; i < n; i++) {
            sum += x[i] * x[i];
        }
    } else {
        for (size_t i = 0; i < n; i++) {
        for (int i = 0; i < n; i++) {
            sum += x[i] * y[i];
        }
    }
@@ -23,13 +23,16 @@
#define MAXLINE 4096
#endif

#define CG_UINT unsigned long long int
// #define CG_UINT unsigned long long int
#define CG_UINT int

#if PRECISION == 1
#define CG_FLOAT float
#define MPI_FLOAT_TYPE MPI_FLOAT
#define PRECISION_STRING "single"
#else
#define CG_FLOAT double
#define MPI_FLOAT_TYPE MPI_DOUBLE
#define PRECISION_STRING "double"
#endif
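// Sketch of how the paired macros are meant to be used together (hypothetical
// call, not taken from this diff): the MPI datatype must always match
// CG_FLOAT, which is what commExchange relies on, e.g.
//
//   CG_FLOAT norm = localNorm; // localNorm: hypothetical rank-local value
//   MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_FLOAT_TYPE, MPI_SUM, MPI_COMM_WORLD);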