Fix bugs and clean up. Add subroutines.

This commit is contained in:
Jan Eitzinger 2025-01-20 18:24:37 +01:00
parent 6f1932cf4f
commit 49dc2d97ae
7 changed files with 396 additions and 345 deletions

View File

@@ -3,9 +3,9 @@
 #==============================================================================
 filename generate
-nx 50
-ny 50
-nz 50
+nx 10
+ny 10
+nz 10
 itermax 10000
 eps 0.0001

View File

@@ -17,5 +17,5 @@ VERSION = --version
 CFLAGS = -O3 -ffast-math -std=c99 $(OPENMP)
 #CFLAGS = -Ofast -fnt-store=aggressive -std=c99 $(OPENMP) #AMD CLANG
 LFLAGS = $(OPENMP)
-DEFINES += -D_GNU_SOURCE
+DEFINES += -D_GNU_SOURCE# -DVERBOSE
 INCLUDES = -I/Users/jan/.local/include

View File

@@ -3,6 +3,7 @@
  * Use of this source code is governed by a MIT style
  * license that can be found in the LICENSE file. */
 #include "util.h"
+#include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -19,15 +20,162 @@ static int sizeOfRank(int rank, int size, int N)
     return N / size + ((N % size > rank) ? 1 : 0);
 }
 
-void commReduction(double* v, int op)
-{
-#ifdef _MPI
-    if (op == MAX) {
-        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-    } else if (op == SUM) {
-        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    }
-#endif
-}
+// Ensure that all the neighbors we expect to receive from also send to us
+// sendList         - All ranks that receive values from us
+// numSendNeighbors - Number of entries in sendList
+// recvList         - All ranks that send values to us
+// numRecvNeighbors - Number of entries in recvList
+// FIXME: What if ranks want to send to us and are not in sendList?
+static void probeNeighbors(
+    int* sendList, int numSendNeighbors, int* recvList, int numRecvNeighbors)
+{
+    int val;
+
+    for (int i = 0; i < numSendNeighbors; i++) {
+        sendList[i] = -1;
+    }
+
+    int MPI_MY_TAG = 99;
+    MPI_Request request[MAX_NUM_MESSAGES];
+
+    for (int i = 0; i < numSendNeighbors; i++) {
+        MPI_Irecv(&val,
+            1,
+            MPI_INT,
+            MPI_ANY_SOURCE,
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+    }
+
+    for (int i = 0; i < numRecvNeighbors; i++) {
+        MPI_Send(&val, 1, MPI_INT, recvList[i], MPI_MY_TAG, MPI_COMM_WORLD);
+    }
+
+    // Receive message from each send neighbor to construct 'sendList'.
+    MPI_Status status;
+
+    for (int i = 0; i < numSendNeighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            printf("MPI_Wait error\n");
+            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+            exit(EXIT_FAILURE);
+        }
+        sendList[i] = status.MPI_SOURCE;
+    }
+}
+
+static void buildNeighbors(Comm* c, int* external_processor)
+{
+    int numNeighbors = c->numNeighbors;
+    int numExternal  = c->numExternal;
+    int* neighbors   = c->neighbors;
+    int lengths[numNeighbors];
+    int MPI_MY_TAG = 100;
+    MPI_Request request[MAX_NUM_MESSAGES];
+
+    // First post receives
+    for (int i = 0; i < numNeighbors; i++) {
+        MPI_Irecv(lengths + i,
+            1,
+            MPI_INT,
+            neighbors[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+    }
+
+    int* recvCount = c->recvCount;
+    int* sendCount = c->sendCount;
+    int j          = 0;
+
+    for (int i = 0; i < numNeighbors; i++) {
+        int count = 0;
+
+        // go through list of external elements until updating
+        // processor changes
+        while ((j < numExternal) && (external_processor[j] == neighbors[i])) {
+            count++;
+            j++;
+            if (j == numExternal) break;
+        }
+        recvCount[i] = count;
+        MPI_Send(&count, 1, MPI_INT, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
+    }
+
+    MPI_Status status;
+    // Complete the receives of the number of externals
+    for (int i = 0; i < numNeighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            printf("MPI_Wait error\n");
+            exit(EXIT_FAILURE);
+        }
+        sendCount[i] = lengths[i];
+    }
+}
+
+static void buildElementsToSend(
+    Comm* c, int startRow, int* external_processor, int* new_external)
+{
+    int numNeighbors = c->numNeighbors;
+    int numExternal  = c->numExternal;
+    int* neighbors   = c->neighbors;
+    int MPI_MY_TAG = 100;
+    MPI_Request request[MAX_NUM_MESSAGES];
+
+    c->elementsToSend = (int*)allocate(ARRAY_ALIGNMENT,
+        c->totalSendCount * sizeof(int));
+    int* elementsToSend = c->elementsToSend;
+
+    int j = 0;
+
+    for (int i = 0; i < numNeighbors; i++) {
+        MPI_Irecv(elementsToSend + j,
+            c->sendCount[i],
+            MPI_INT,
+            neighbors[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+        j += c->sendCount[i];
+    }
+
+    j = 0;
+
+    for (int i = 0; i < numNeighbors; i++) {
+        int start = j;
+
+        // Go through list of external elements
+        // until updating processor changes. This is redundant, but
+        // saves us from recording this information.
+        while ((j < numExternal) && (external_processor[j] == neighbors[i])) {
+            j++;
+            if (j == numExternal) break;
+        }
+        MPI_Send(new_external + start,
+            j - start,
+            MPI_INT,
+            neighbors[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD);
+    }
+
+    MPI_Status status;
+    // receive from each neighbor the global index list of external elements
+    for (int i = 0; i < numNeighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            printf("MPI_Wait error\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    /// replace global indices by local indices
+    for (int i = 0; i < c->totalSendCount; i++) {
+        elementsToSend[i] -= startRow;
+    }
+}
 
 void commPartition(Comm* c, Matrix* A)
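The handshake inside probeNeighbors is easiest to see in isolation: a rank pings every rank it receives data from, posts one wildcard receive per rank that receives data from it, and reads the sender of each completed receive from MPI_SOURCE. A minimal self-contained sketch of that pattern on a ring topology (the ring and the variable names are illustrative only, not part of the commit):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Ring: every rank receives halo data from its left neighbor.
    int recvFrom = (rank - 1 + size) % size; // known a priori
    int sendTo   = -1;                       // discovered by the handshake
    int val = 0, tag = 99;

    // One wildcard receive per rank that will ping us (here exactly one).
    MPI_Request req;
    MPI_Irecv(&val, 1, MPI_INT, MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &req);

    // Ping the rank we receive data from.
    MPI_Send(&val, 1, MPI_INT, recvFrom, tag, MPI_COMM_WORLD);

    // The source of the completed receive is the rank that receives from us.
    MPI_Status status;
    MPI_Wait(&req, &status);
    sendTo = status.MPI_SOURCE;

    printf("Rank %d: receives from %d, sends to %d\n", rank, recvFrom, sendTo);
    MPI_Finalize();
    return 0;
}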
@@ -35,17 +183,16 @@ void commPartition(Comm* c, Matrix* A)
 #ifdef _MPI
     int rank = c->rank;
     int size = c->size;
-    MPI_Comm comm = c->comm;
 
     // Extract Matrix pieces
-    int start_row = A->startRow;
-    int stop_row = A->stopRow;
-    int total_nrow = A->totalNr;
-    long long total_nnz = A->totalNnz;
-    int local_nrow = A->nr;
-    int local_nnz = A->nnz;
-    int* row_ptr = (int*)A->rowPtr;
-    int* col_ind = (int*)A->colInd;
+    CG_UINT start_row = A->startRow;
+    CG_UINT stop_row = A->stopRow;
+    CG_UINT total_nrow = A->totalNr;
+    CG_UINT total_nnz = A->totalNnz;
+    CG_UINT local_nrow = A->nr;
+    CG_UINT local_nnz = A->nnz;
+    CG_UINT* row_ptr = A->rowPtr;
+    CG_UINT* col_ind = A->colInd;
 
     // We need to convert the index values for the rows on this processor
     // to a local index space. We need to:
@@ -58,48 +205,54 @@ void commPartition(Comm* c, Matrix* A)
     // - find out which processor owns the value.
     // - Set up communication for sparse MV operation.
 
-    // Scan the indices and transform to local
-    int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
-    int num_external = 1;
+    // FIXME: Use a lookup table with size total number of rows. For lower memory
+    // consumption a map would be better choice.
+    int* externals = (int*)allocate(ARRAY_ALIGNMENT, total_nrow * sizeof(int));
+    int num_external = 0; // local number of external indices
+
+    // column indices that are not processed yet are marked with -1
+    for (int i = 0; i < total_nrow; i++) {
+        externals[i] = -1;
+    }
 
     int* external_index = (int*)allocate(ARRAY_ALIGNMENT,
         MAX_EXTERNAL * sizeof(int));
-    c->external_index = external_index;
-
-    for (int i = 0; i < A->totalNr; i++) {
-        externals[i] = -1;
-    }
 
     for (int i = 0; i < local_nrow; i++) {
         for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) {
             int cur_ind = A->colInd[j];
 
 #ifdef VERBOSE
-            printf("Process %d of %d getting index %d in local row %d\n",
+            printf("Rank %d of %d getting entry %d:index %d in local row %d\n",
                 rank,
                 size,
+                j,
                 cur_ind,
                 i);
 #endif
 
-            // shift local rows to the start
+            // convert local column references to local numbering
             if (start_row <= cur_ind && cur_ind <= stop_row) {
                 col_ind[j] -= start_row;
             } else {
-                // Must find out if we have already set up this point
+                // find out if we have already set up this point
                 if (externals[cur_ind] == -1) {
-                    externals[cur_ind] = num_external++;
+                    externals[cur_ind] = num_external;
 
                     if (num_external <= MAX_EXTERNAL) {
-                        external_index[num_external - 1] = cur_ind;
-                        // Mark index as external by negating it
-                        col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
+                        external_index[num_external] = cur_ind;
+                        // mark in local column index as external by negating it
+                        // col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
+                        col_ind[j] = -col_ind[j];
                     } else {
                         printf("Must increase MAX_EXTERNAL\n");
+                        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
                         exit(EXIT_FAILURE);
                     }
+                    num_external++;
                 } else {
-                    // Mark index as external by negating it
+                    // Mark index as external by adding 1 and negating it
+                    // col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
                     col_ind[j] = -col_ind[j];
                 }
             }
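The marking scheme above depends on the column index type being signed (consistent with CG_UINT being switched to int further down in this commit): an off-rank column keeps its global value but is stored negated, so a later pass can spot it by sign and rewrite it to a local position. A small stand-alone illustration with made-up numbers; it shares the caveat the FIXME comments point at, namely that a global index of 0 cannot be marked by plain negation:

#include <stdio.h>

int main(void)
{
    int startRow = 4, stopRow = 7;    // rows owned by this rank (made up)
    int colInd[] = { 4, 6, 2, 9, 7 }; // global column indices (made up)
    int n = sizeof(colInd) / sizeof(colInd[0]);

    for (int j = 0; j < n; j++) {
        if (startRow <= colInd[j] && colInd[j] <= stopRow) {
            colInd[j] -= startRow; // local column: shift into 0..nr-1
        } else {
            colInd[j] = -colInd[j]; // external: keep global value, flag by sign
        }
    }

    for (int j = 0; j < n; j++) {
        printf("%d%s ", colInd[j], colInd[j] < 0 ? "(ext)" : "");
    }
    printf("\n"); // prints: 0 2 -2(ext) -9(ext) 3
    return 0;
}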
@@ -109,44 +262,42 @@ void commPartition(Comm* c, Matrix* A)
     /**************************************************************************
     Go through list of externals to find out which processors must be accessed.
     **************************************************************************/
-    c->num_external = num_external;
-    int tmp_buffer[size];
-    int global_index_offsets[size];
-
-    for (int i = 0; i < size; i++) {
-        tmp_buffer[i] = 0;
-    }
-
-    tmp_buffer[rank] = start_row;
-
-    MPI_Allreduce(tmp_buffer,
-        global_index_offsets,
-        size,
-        MPI_INT,
-        MPI_SUM,
-        MPI_COMM_WORLD);
-
-    // Go through list of externals and find the processor that owns each
     int external_processor[num_external];
 
+    {
+        int globalIndexOffsets[size];
+        MPI_Allgather(&start_row,
+            1,
+            MPI_INT,
+            globalIndexOffsets,
+            1,
+            MPI_INT,
+            MPI_COMM_WORLD);
+
+        // for (int i = 0; i < size; i++) {
+        //     printf("Rank %d: i = %d: OFFSET %d\n", rank, i, globalIndexOffsets[i]);
+        // }
+
+        // Go through list of externals and find the processor that owns each
         for (int i = 0; i < num_external; i++) {
-            int cur_ind = external_index[i];
+            int globalIndex = external_index[i];
 
             for (int j = size - 1; j >= 0; j--) {
-                if (global_index_offsets[j] <= cur_ind) {
+                if (globalIndexOffsets[j] <= globalIndex) {
                     external_processor[i] = j;
                     break;
                 }
            }
        }
+    }
 
     /*Go through the external elements. For each newly encountered external
     point assign it the next index in the local sequence. Then look for other
-    external elements who are updated by the same node and assign them the next
+    external elements who are updated by the same rank and assign them the next
     set of index numbers in the local sequence (ie. elements updated by the same
-    node have consecutive indices).*/
+    rank have consecutive indices).*/
 
     int* external_local_index = (int*)allocate(ARRAY_ALIGNMENT,
-        MAX_EXTERNAL * sizeof(int));
-    c->external_local_index = external_local_index;
+        num_external * sizeof(int));
 
     int count = local_nrow;
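Ownership of an external index is resolved purely from the gathered start_row values: scanning ranks from highest to lowest, the first rank whose offset does not exceed the global index owns it. A tiny stand-alone version of that lookup; the offsets are made up here, while in the diff they come from the MPI_Allgather above:

#include <stdio.h>

// First row owned by each rank; rank r owns rows offsets[r] .. offsets[r+1]-1.
static int ownerOf(int globalIndex, const int* offsets, int size)
{
    for (int j = size - 1; j >= 0; j--) {
        if (offsets[j] <= globalIndex) {
            return j;
        }
    }
    return -1; // not reached for valid indices
}

int main(void)
{
    int offsets[] = { 0, 25, 50, 75 }; // 4 ranks, 25 rows each (made up)
    printf("row 10 -> rank %d\n", ownerOf(10, offsets, 4)); // 0
    printf("row 60 -> rank %d\n", ownerOf(60, offsets, 4)); // 2
    printf("row 75 -> rank %d\n", ownerOf(75, offsets, 4)); // 3
    return 0;
}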
@@ -172,27 +323,39 @@ void commPartition(Comm* c, Matrix* A)
     for (int i = 0; i < local_nrow; i++) {
         for (int j = rowPtr[i]; j < rowPtr[i + 1]; j++) {
             if (col_ind[j] < 0) {
-                int cur_ind = -col_ind[j] - 1; // FIXME: Offset by 1??
+                // size_t cur_ind = -(col_ind[j] - 1); // FIXME: Offset by 1??
+                int cur_ind = -col_ind[j];
                 col_ind[j] = external_local_index[externals[cur_ind]];
             }
         }
     }
 
-    int new_external_processor[num_external];
-
-    for (int i = 0; i < num_external; i++) {
-        new_external_processor[i] = 0;
-    }
-
-    // setup map from external id to partition
-    for (int i = 0; i < num_external; i++) {
-        new_external_processor[external_local_index[i] - local_nrow] =
-            external_processor[i];
-    }
+    free(externals);
+    // int new_external_processor[num_external];
+    //
+    // for (int i = 0; i < num_external; i++) {
+    //     new_external_processor[i] = -1;
+    // }
+    //
+    // // setup map from external id to partition
+    // for (int i = 0; i < num_external; i++) {
+    //     int id = external_local_index[i] - local_nrow;
+    //     new_external_processor[id] = external_processor[i];
+    //     printf("Rank %d of %d: %d new_external_processor[%d] = %d\n",
+    //         rank,
+    //         size,
+    //         i,
+    //         id,
+    //         external_processor[i]);
+    // }
+    // commFinalize(c);
+    // exit(EXIT_SUCCESS);
+    //
 
 #ifdef VERBOSE
+    printf("Rank %d of %d: %d externals\n", rank, size, num_external);
+
     for (int i = 0; i < num_external; i++) {
-        printf("Process %d of %d: external process[%d] = %d\n",
+        printf("Rank %d of %d: external[%d] owned by %d\n",
             rank,
             size,
             i,
@@ -217,119 +380,93 @@ void commPartition(Comm* c, Matrix* A)
     int num_recv_neighbors = 0;
     int length = 1;
 
+    // Encoding both number of ranks that need values from this rank and the total
+    // number of values by adding one for any additional rank and adding size for
+    // every additional value.
     for (int i = 0; i < num_external; i++) {
-        if (tmp_neighbors[new_external_processor[i]] == 0) {
+        if (tmp_neighbors[external_processor[i]] == 0) {
             num_recv_neighbors++;
-            tmp_neighbors[new_external_processor[i]] = 1;
+            tmp_neighbors[external_processor[i]] = 1;
         }
-        tmp_neighbors[new_external_processor[i]] += size;
+        tmp_neighbors[external_processor[i]] += size;
     }
 
     // sum over all processors all the tmp_neighbors arrays
-    MPI_Allreduce(tmp_neighbors,
-        tmp_buffer,
+    MPI_Allreduce(MPI_IN_PLACE,
+        tmp_neighbors,
         size,
         MPI_INT,
         MPI_SUM,
         MPI_COMM_WORLD);
 
-    /* decode the combined 'tmp_neighbors' (stored in tmp_buffer) array from all
-     * the processors */
-    int num_send_neighbors = tmp_buffer[rank] % size;
+    /* decode the combined 'tmp_neighbors' array from all ranks */
+    // Number of ranks that receive values from us
+    int num_send_neighbors = tmp_neighbors[rank] % size;
 
-    /* decode 'tmp_buffer[rank] to deduce total number of elements we must send */
-    int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;
+    /* decode 'tmp_neighbors[rank] to deduce total number of elements we must send
+     */
+    c->totalSendCount = (tmp_neighbors[rank] - num_send_neighbors) / size;
 
-    /* Check to see if we have enough workspace allocated. This could be
+    /* Check to see if we have enough memory allocated. This could be
     dynamically modified, but let's keep it simple for now...*/
     if (num_send_neighbors > MAX_NUM_MESSAGES) {
         printf("Must increase MAX_NUM_MESSAGES. Must be at least %d\n",
             num_send_neighbors);
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
         exit(EXIT_FAILURE);
     }
 
-    if (total_to_be_sent > MAX_EXTERNAL) {
+    if (c->totalSendCount > MAX_EXTERNAL) {
         printf("Must increase MAX_EXTERNAL. Must be at least %d\n",
-            total_to_be_sent);
+            c->totalSendCount);
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
         exit(EXIT_FAILURE);
     }
 
 #ifdef VERBOSE
-    cout << "Processor " << rank << " of " << size
-         << ": Number of send neighbors = " << num_send_neighbors << endl;
-
-    cout << "Processor " << rank << " of " << size
-         << ": Number of receive neighbors = " << num_recv_neighbors << endl;
-
-    cout << "Processor " << rank << " of " << size
-         << ": Total number of elements to send = " << total_to_be_sent << endl;
+    printf("Rank %d of %d: tmp_neighbors = %d\n",
+        rank,
+        size,
+        tmp_neighbors[rank]);
+    printf("Rank %d of %d: Number of send neighbors = %d\n",
+        rank,
+        size,
+        num_send_neighbors);
+    printf("Rank %d of %d: Number of receive neighbors = %d\n",
+        rank,
+        size,
+        num_recv_neighbors);
+    printf("Rank %d of %d: Total number of elements to send = %d\n",
+        rank,
+        size,
+        c->totalSendCount);
 
     MPI_Barrier(MPI_COMM_WORLD);
 #endif
 
     /* Make a list of the neighbors that will send information to update our
     external elements (in the order that we will receive this information).*/
-    int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
+    int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_NUM_MESSAGES * sizeof(int));
 
+    // FIXME: Create local scope
     int j = 0;
-    recv_list[j++] = new_external_processor[0];
+    recv_list[j++] = external_processor[0];
 
     for (int i = 1; i < num_external; i++) {
-        if (new_external_processor[i - 1] != new_external_processor[i]) {
-            recv_list[j++] = new_external_processor[i];
+        if (external_processor[i - 1] != external_processor[i]) {
+            recv_list[j++] = external_processor[i];
         }
     }
 
     // Ensure that all the neighbors we expect to receive from also send to us
-    // Send a 0 length message to each of our recv neighbors
     int send_list[num_send_neighbors];
-
-    for (int i = 0; i < num_send_neighbors; i++) {
-        send_list[i] = 0;
-    }
-
-    // first post receives, these are immediate receives
-    // Do not wait for result to come, will do that at the
-    // wait call below.
-    int MPI_MY_TAG = 99;
-    MPI_Request request[MAX_NUM_MESSAGES];
-
-    for (int i = 0; i < num_send_neighbors; i++) {
-        MPI_Irecv(tmp_buffer + i,
-            1,
-            MPI_INT,
-            MPI_ANY_SOURCE,
-            MPI_MY_TAG,
-            MPI_COMM_WORLD,
-            request + i);
-    }
-
-    // send messages
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        MPI_Send(tmp_buffer + i,
-            1,
-            MPI_INT,
-            recv_list[i],
-            MPI_MY_TAG,
-            MPI_COMM_WORLD);
-    }
-
-    // Receive message from each send neighbor to construct 'send_list'.
-    MPI_Status status;
-
-    for (int i = 0; i < num_send_neighbors; i++) {
-        if (MPI_Wait(request + i, &status)) {
-            printf("MPI_Wait error\n");
-            exit(EXIT_FAILURE);
-        }
-        send_list[i] = status.MPI_SOURCE;
-    }
+    probeNeighbors(send_list, num_send_neighbors, recv_list, num_recv_neighbors);
 
     /* Compare the two lists. In most cases they should be the same.
     // However, if they are not then add new entries to the recv list
     // that are in the send list (but not already in the recv list).
-    WHY!! This ensures that the sendlist is equal to the sendlist
+    FIXME: WHY!! This ensures that the recv_list is equal to the sendlist
     But why is this required? -> Just One neighbour list??*/
     for (int j = 0; j < num_send_neighbors; j++) {
         int found = 0;
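The encoding decoded in the hunk above packs two quantities into one integer slot per rank so that a single Allreduce suffices: every rank that needs values from rank p adds 1 to slot p once, plus size for each requested value, so after the global sum slot rank holds count + size * values, which % size and / size pull apart again. A small worked example with made-up numbers:

#include <stdio.h>

int main(void)
{
    int size = 4; // number of MPI ranks (made up)

    // Contributions to our slot from two other ranks: one needs 3 of our
    // values, the other needs 5 (made-up numbers).
    int slot = (1 + 3 * size) + (1 + 5 * size); // what the Allreduce sum yields

    int numSendNeighbors = slot % size;                       // 2 ranks
    int totalSendCount   = (slot - numSendNeighbors) / size;  // 8 values

    printf("send neighbors = %d, values to send = %d\n",
        numSendNeighbors,
        totalSendCount);
    return 0;
}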
@@ -343,31 +480,25 @@ void commPartition(Comm* c, Matrix* A)
                 rank,
                 size,
                 num_recv_neighbors,
-                send_list[i]);
+                send_list[j]);
 #endif
             recv_list[num_recv_neighbors] = send_list[j];
-            (num_recv_neighbors)++;
+            num_recv_neighbors++;
         }
     }
 
-    num_send_neighbors = num_recv_neighbors;
-
-    if (num_send_neighbors > MAX_NUM_MESSAGES) {
+    // From here on only have one neighbor list for both send and recv
+    c->numNeighbors = num_recv_neighbors;
+
+    for (int i = 0; i < c->numNeighbors; i++) {
+        c->neighbors[i] = recv_list[i];
+    }
+
+    if (c->numNeighbors > MAX_NUM_MESSAGES) {
         printf("Must increase MAX_EXTERNAL\n");
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
         exit(EXIT_FAILURE);
     }
 
-    // Start filling communication setup
-    // Create 'new_external' which explicitly put the external elements in the
-    // order given by 'external_local_index'
-    c->total_to_be_sent = total_to_be_sent;
-    int* elements_to_send = (int*)allocate(ARRAY_ALIGNMENT,
-        total_to_be_sent * sizeof(int));
-    c->elements_to_send = elements_to_send;
-
-    for (int i = 0; i < total_to_be_sent; i++) {
-        elements_to_send[i] = 0;
-    }
-
     // Create 'new_external' which explicitly put the external elements in the
     // order given by 'external_local_index'
     int* new_external = (int*)allocate(ARRAY_ALIGNMENT,
@@ -377,195 +508,100 @@ void commPartition(Comm* c, Matrix* A)
         new_external[external_local_index[i] - local_nrow] = external_index[i];
     }
 
+    free(external_local_index);
+    free(external_index);
+
+    c->numExternal = num_external;
+    buildNeighbors(c, external_processor);
+
     // Send each processor the global index list of the external elements in the
     // order that I will want to receive them when updating my external elements
-    int lengths[num_recv_neighbors];
-    MPI_MY_TAG++;
-
-    // First post receives
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        int partner = recv_list[i];
-        MPI_Irecv(lengths + i,
-            1,
-            MPI_INT,
-            partner,
-            MPI_MY_TAG,
-            MPI_COMM_WORLD,
-            request + i);
-    }
-
-    int* neighbors = c->neighbors;
-    int* recv_length = c->recv_length;
-    int* send_length = c->send_length;
-
-    j = 0;
-
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        int start = j;
-        int newlength = 0;
-
-        // go through list of external elements until updating
-        // processor changes
-        while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
-            newlength++;
-            j++;
-            if (j == num_external) break;
-        }
-
-        recv_length[i] = newlength;
-        neighbors[i] = recv_list[i];
-        length = j - start;
-        MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
-    }
-
-    // Complete the receives of the number of externals
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        if (MPI_Wait(request + i, &status)) {
-            printf("MPI_Wait error\n");
-            exit(EXIT_FAILURE);
-        }
-        send_length[i] = lengths[i];
-    }
-
-    // Build "elements_to_send" list. These are the x elements I own
-    // that need to be sent to other processors.
-    MPI_MY_TAG++;
-
-    j = 0;
-
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        MPI_Irecv(elements_to_send + j,
-            send_length[i],
-            MPI_INT,
-            neighbors[i],
-            MPI_MY_TAG,
-            MPI_COMM_WORLD,
-            request + i);
-        j += send_length[i];
-    }
-
-    j = 0;
-
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        int start = j;
-        int newlength = 0;
-
-        // Go through list of external elements
-        // until updating processor changes. This is redundant, but
-        // saves us from recording this information.
-        while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
-            newlength++;
-            j++;
-            if (j == num_external) break;
-        }
-
-        MPI_Send(new_external + start,
-            j - start,
-            MPI_INT,
-            recv_list[i],
-            MPI_MY_TAG,
-            MPI_COMM_WORLD);
-    }
-
-    // receive from each neighbor the global index list of external elements
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        if (MPI_Wait(request + i, &status)) {
-            printf("MPI_Wait error\n");
-            exit(EXIT_FAILURE);
-        }
-    }
-
-    /// replace global indices by local indices
-    for (int i = 0; i < total_to_be_sent; i++) {
-        elements_to_send[i] -= start_row;
-    }
-
-    // Finish up !!
-    c->num_send_neighbors = num_send_neighbors;
+    // Build "elementsToSend" list. These are the x elements the current rank
+    // owns that need to be sent to other ranks
+    buildElementsToSend(c, A->startRow, external_processor, new_external);
+
     A->nc = A->nc + num_external;
-
-    // Used in exchange
-    CG_FLOAT* send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
-        total_to_be_sent * sizeof(CG_FLOAT));
-    c->send_buffer = send_buffer;
+    c->send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
+        c->totalSendCount * sizeof(CG_FLOAT));
 
     free(recv_list);
     free(new_external);
 #endif
 }
 
-void commExchange(Comm* c, Matrix* A, double* x)
+void commExchange(Comm* c, Matrix* A, CG_FLOAT* x)
 {
 #ifdef _MPI
-    int num_external = 0;
-
-    // Extract Matrix pieces
-    int local_nrow = A->nr;
-    int num_neighbors = c->num_send_neighbors;
-    int* recv_length = c->recv_length;
-    int* send_length = c->send_length;
+    int numNeighbors = c->numNeighbors;
     int* neighbors = c->neighbors;
-    double* send_buffer = c->send_buffer;
-    int total_to_be_sent = c->total_to_be_sent;
-    int* elements_to_send = c->elements_to_send;
-
-    int rank = c->rank;
-    int size = c->size;
-    MPI_Comm comm = c->comm;
-
-    // first post receives, these are immediate receives
-    // Do not wait for result to come, will do that at the
-    // wait call below.
+    int* recvCount = c->recvCount;
+    int* sendCount = c->sendCount;
+    CG_FLOAT* sendBuffer = c->send_buffer;
+    int* elementsToSend = c->elementsToSend;
+
     int MPI_MY_TAG = 99;
-    MPI_Request request[num_neighbors];
+    MPI_Request request[numNeighbors];
 
     // Externals are at end of locals
-    double* x_external = (double*)x + local_nrow;
+    CG_FLOAT* externals = x + A->nr;
 
-    // Post receives first
-    for (int i = 0; i < num_neighbors; i++) {
-        int n_recv = recv_length[i];
-        MPI_Irecv(x_external,
-            n_recv,
-            MPI_DOUBLE,
+    // Post receives
+    for (int i = 0; i < numNeighbors; i++) {
+        int count = recvCount[i];
+
+        MPI_Irecv(externals,
+            count,
+            MPI_FLOAT_TYPE,
             neighbors[i],
             MPI_MY_TAG,
             MPI_COMM_WORLD,
             request + i);
-        x_external += n_recv;
+
+        externals += count;
     }
 
-    // Fill up send buffer
-    for (int i = 0; i < total_to_be_sent; i++) {
-        send_buffer[i] = x[elements_to_send[i]];
+    // Copy values for all ranks into send buffer
+    for (int i = 0; i < c->totalSendCount; i++) {
+        sendBuffer[i] = x[elementsToSend[i]];
     }
 
     // Send to each neighbor
-    for (int i = 0; i < num_neighbors; i++) {
-        int n_send = send_length[i];
-        MPI_Send(send_buffer,
-            n_send,
-            MPI_DOUBLE,
+    for (int i = 0; i < numNeighbors; i++) {
+        int count = sendCount[i];
+
+        MPI_Send(sendBuffer,
+            count,
+            MPI_FLOAT_TYPE,
             neighbors[i],
             MPI_MY_TAG,
             MPI_COMM_WORLD);
-        send_buffer += n_send;
+
+        sendBuffer += count;
     }
 
-    // Complete the reads issued above
+    // Complete the receives issued above
     MPI_Status status;
 
-    for (int i = 0; i < num_neighbors; i++) {
+    for (int i = 0; i < numNeighbors; i++) {
         if (MPI_Wait(request + i, &status)) {
             printf("MPI_Wait error\n");
+            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
             exit(EXIT_FAILURE);
         }
     }
 #endif
 }
 
+void commReduction(double* v, int op)
+{
+#ifdef _MPI
+    if (op == MAX) {
+        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+    } else if (op == SUM) {
+        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    }
+#endif
+}
+
 void commPrintConfig(Comm* c)
 {
 #ifdef _MPI
@@ -577,6 +613,21 @@ void commPrintConfig(Comm* c)
     for (int i = 0; i < c->size; i++) {
         if (i == c->rank) {
+            printf("Rank %d has %d neighbors with %d externals:\n",
+                c->rank,
+                c->numNeighbors,
+                c->numExternal);
+            for (int j = 0; j < c->numNeighbors; j++) {
+                printf("\t%d: receive %d send %d\n",
+                    c->neighbors[j],
+                    c->recvCount[j],
+                    c->sendCount[j]);
+            }
+            printf("\tSend %d elements: [", c->totalSendCount);
+            for (int j = 0; j < c->totalSendCount; j++) {
+                printf("%d, ", c->elementsToSend[j]);
+            }
+            printf("]\n");
             fflush(stdout);
         }
         MPI_Barrier(MPI_COMM_WORLD);
@@ -594,14 +645,14 @@ void commMatrixDump(Comm* c, Matrix* m)
     CG_FLOAT* val = m->val;
 
     if (commIsMaster(c)) {
-        printf("Matrix: %lld total non zeroes, total number of rows %lld\n",
+        printf("Matrix: %d total non zeroes, total number of rows %d\n",
             m->totalNnz,
             m->totalNr);
     }
 
     for (int i = 0; i < size; i++) {
         if (i == rank) {
-            printf("Rank %d: %lld non zeroes, number of rows %lld\n",
+            printf("Rank %d: %d non zeroes, number of rows %d\n",
                 rank,
                 m->nnz,
                 numRows);
@@ -609,9 +660,9 @@ void commMatrixDump(Comm* c, Matrix* m)
             for (int rowID = 0; rowID < numRows; rowID++) {
                 printf("Row [%d]: ", rowID);
 
-                for (size_t rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
+                for (int rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
                      rowEntry++) {
-                    printf("[%lld]:%.2f ", colInd[rowEntry], val[rowEntry]);
+                    printf("[%d]:%.2f ", colInd[rowEntry], val[rowEntry]);
                 }
                 printf("\n");

View File

@@ -4,6 +4,7 @@
  * license that can be found in the LICENSE file. */
 #ifndef __COMM_H_
 #define __COMM_H_
+#include "util.h"
 #if defined(_MPI)
 #include <mpi.h>
 #endif
@@ -20,18 +21,14 @@ typedef struct {
     int rank;
     int size;
 #if defined(_MPI)
-    MPI_Comm comm;
-
-    int num_external;
-    int num_send_neighbors;
-    int* external_index;
-    int* external_local_index;
-    int total_to_be_sent;
-    int* elements_to_send;
+    int numNeighbors;
+    int numExternal;
+    int totalSendCount;
+    int* elementsToSend;
     int neighbors[MAX_NUM_NEIGHBOURS];
-    int recv_length[MAX_NUM_NEIGHBOURS];
-    int send_length[MAX_NUM_NEIGHBOURS];
-    double* send_buffer;
+    int recvCount[MAX_NUM_NEIGHBOURS];
+    int sendCount[MAX_NUM_NEIGHBOURS];
+    CG_FLOAT* send_buffer;
 #endif
 } Comm;

View File

@@ -55,9 +55,9 @@ int main(int argc, char** argv)
     CG_FLOAT eps = (CG_FLOAT)param.eps;
     int itermax = param.itermax;
 
     initSolver(&s, &comm, &param);
-    commMatrixDump(&comm, &s.A);
-    commFinalize(&comm);
-    exit(EXIT_SUCCESS);
+    // commMatrixDump(&comm, &s.A);
+    commPartition(&comm, &s.A);
+    commPrintConfig(&comm);
 
     CG_UINT nrow = s.A.nr;
     CG_UINT ncol = s.A.nc;

View File

@@ -109,7 +109,7 @@ static void matrixGenerate(
 #ifdef VERBOSE
     printf("Process %d of %d has %d rows\n", rank, size, local_nrow);
     printf("Global rows %d through %d\n", start_row, stop_row);
-    printf("%d nonzeros\n", start_row, stop_row);
+    printf("%d nonzeros\n", local_nnz);
 #endif /* ifdef VERBOSE */
 
     s->A.startRow = start_row;
@@ -139,11 +139,11 @@ void spMVM(Matrix* m, const CG_FLOAT* restrict x, CG_FLOAT* restrict y)
     CG_UINT* colInd = m->colInd;
     CG_FLOAT* val = m->val;
 
-    for (size_t rowID = 0; rowID < numRows; rowID++) {
+    for (int rowID = 0; rowID < numRows; rowID++) {
         CG_FLOAT tmp = y[rowID];
 
         // loop over all elements in row
-        for (size_t rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
+        for (int rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
              rowEntry++) {
             tmp += val[rowEntry] * x[colInd[rowEntry]];
         }
@@ -160,15 +160,15 @@
     CG_FLOAT* const w)
 {
     if (alpha == 1.0) {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             w[i] = x[i] + beta * y[i];
         }
     } else if (beta == 1.0) {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             w[i] = alpha * x[i] + y[i];
         }
     } else {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             w[i] = alpha * x[i] + beta * y[i];
         }
     }
@@ -182,11 +182,11 @@ void ddot(const CG_UINT n,
     CG_FLOAT sum = 0.0;
 
     if (y == x) {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             sum += x[i] * x[i];
         }
     } else {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             sum += x[i] * y[i];
         }
     }

View File

@@ -23,13 +23,16 @@
 #define MAXLINE 4096
 #endif
 
-#define CG_UINT unsigned long long int
+// #define CG_UINT unsigned long long int
+#define CG_UINT int
 
 #if PRECISION == 1
 #define CG_FLOAT float
+#define MPI_FLOAT_TYPE MPI_FLOAT
 #define PRECISION_STRING "single"
 #else
 #define CG_FLOAT double
+#define MPI_FLOAT_TYPE MPI_DOUBLE
 #define PRECISION_STRING "double"
 #endif
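
The new MPI_FLOAT_TYPE macro keeps the MPI datatype in step with CG_FLOAT, so calls such as the ones in commExchange stay correct when PRECISION flips between single and double. A minimal sketch of the pairing, assuming only the macros from this hunk:

#include <mpi.h>
#include <stdio.h>

#if PRECISION == 1
#define CG_FLOAT       float
#define MPI_FLOAT_TYPE MPI_FLOAT
#else
#define CG_FLOAT       double
#define MPI_FLOAT_TYPE MPI_DOUBLE
#endif

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    // Each rank contributes 1.0; the matching MPI type is picked by the macro.
    CG_FLOAT local = 1.0;
    MPI_Allreduce(MPI_IN_PLACE, &local, 1, MPI_FLOAT_TYPE, MPI_SUM, MPI_COMM_WORLD);

    printf("global sum = %f\n", (double)local);
    MPI_Finalize();
    return 0;
}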