Seuquential version finished. Start with MPI stuff

Does not compile.
2025-01-06 12:17:19 +01:00
parent 3428d84c3d
commit 512e2903c8
14 changed files with 874 additions and 203 deletions
--- a/2
+++ b/2
@@ -27,7 +27,7 @@ clist = $(subst $(eval) ,$c,$(strip $1))

 define CLANGD_TEMPLATE
 CompileFlags:
-  Add: [$(call clist,$(INCLUDES)), $(call clist,$(CPPFLAGS))]
+  Add: [$(call clist,$(INCLUDES)), $(call clist,$(DEFINES)), $(call clist,$(OPTIONS)), -xc]
  Compiler: clang
 endef

--- a/config.mk
+++ b/config.mk
@@ -1,9 +1,11 @@
 # Supported: GCC, CLANG, ICC
 TOOLCHAIN ?= CLANG
+ENABLE_MPI ?= true
 # ENABLE_OPENMP ?= false

 #Feature options
 OPTIONS +=  -DARRAY_ALIGNMENT=64
+#OPTIONS +=  -DVERBOSE
 #OPTIONS +=  -DVERBOSE_AFFINITY
 #OPTIONS +=  -DVERBOSE_DATASIZE
 #OPTIONS +=  -DVERBOSE_TIMER
--- a/hpcg.par
+++ b/hpcg.par
@@ -0,0 +1,11 @@
+#==============================================================================
+#                              HPCG 27pt stencil
+#==============================================================================
+
+filename generate
+nx 50
+ny 50
+nz 50
+
+itermax 10000
+eps 0.0001
--- a/mk/include_CLANG.mk
+++ b/mk/include_CLANG.mk
@@ -1,7 +1,13 @@
+ifeq ($(strip $(ENABLE_MPI)),true)
 CC   = mpicc
+DEFINES  = -D_MPI
+else
+CC = clang
+endif
+
 LD   = $(CC)

-ifeq ($(ENABLE_OPENMP),true)
+ifeq ($(strip $(ENABLE_OPENMP)),true)
 OPENMP   = -fopenmp
 #OPENMP   = -Xpreprocessor -fopenmp #required on Macos with homebrew libomp
 LIBS     = # -lomp
@@ -11,5 +17,5 @@ VERSION  = --version
 CFLAGS   = -Ofast -std=c99 $(OPENMP)
 #CFLAGS   = -Ofast -fnt-store=aggressive  -std=c99 $(OPENMP) #AMD CLANG
 LFLAGS   = $(OPENMP)
-DEFINES  = -D_GNU_SOURCE
-INCLUDES = -I~/.local/include
+DEFINES  += -D_GNU_SOURCE
+INCLUDES = -I/Users/jan/.local/include
--- a/src/comm.c
+++ b/src/comm.c
@@ -1,14 +1,23 @@
-/*
- * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+/* Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
 * All rights reserved. This file is part of nusif-solver.
 * Use of this source code is governed by a MIT style
 * license that can be found in the LICENSE file.
 */
+#include "util.h"
 #include <stdio.h>
 #include <stdlib.h>

+#ifdef _MPI
+#include <mpi.h>
+#endif
+
+#include "allocate.h"
 #include "comm.h"

+#define MAX_EXTERNAL       100000
+#define MAX_NUM_MESSAGES   500
+#define MAX_NUM_NEIGHBOURS MAX_NUM_MESSAGES
+
 // subroutines local to this module
 int sizeOfRank(int rank, int size, int N)
 {
@@ -26,6 +35,528 @@ void commReduction(double* v, int op)
 #endif
 }

+void commPartition(Comm* c, Matrix* A)
+{
+    int rank      = c->rank;
+    int size      = c->size;
+    MPI_Comm comm = c->comm;
+
+    // Extract Matrix pieces
+    int start_row       = A->startRow;
+    int stop_row        = A->stopRow;
+    int total_nrow      = A->totalNr;
+    long long total_nnz = A->totalNnz;
+    int local_nrow      = A->nr;
+    int local_nnz       = A->nnz;
+    int* row_ptr        = (int*)A->rowPtr;
+    int* col_ind        = (int*)A->colInd;
+
+    // int* nnz_in_row             = A->nnz_in_row;
+    // double** ptr_to_vals_in_row = A->ptr_to_vals_in_row;
+    // int** ptr_to_inds_in_row    = A->ptr_to_inds_in_row;
+
+    // We need to convert the index values for the rows on this processor
+    // to a local index space. We need to:
+    // - Determine if each index reaches to a local value or external value
+    // - If local, subtract start_row from index value to get local index
+    // - If external, find out if it is already accounted for.
+    //   - If so, then do nothing,
+    //   - otherwise
+    //     - add it to the list of external indices,
+    //     - find out which processor owns the value.
+    //     - Set up communication for sparse MV operation.
+
+    ///////////////////////////////////////////
+    // Scan the indices and transform to local
+    ///////////////////////////////////////////
+
+    int* external_index = (int*)allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
+    int* externals      = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
+    int num_external    = 1;
+
+    c->external_index = external_index;
+
+    for (int i = 0; i < A->totalNr; i++) {
+        externals[i] = -1;
+    }
+
+    for (int i = 0; i < local_nrow; i++) {
+        for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) {
+            int cur_ind = A->colInd[j];
+
+#ifdef VERBOSE
+            printf("Process %d of %d getting index %d in local row %d\n",
+                rank,
+                size,
+                cur_ind,
+                i);
+#endif
+
+            // shift local rows to the start
+            if (start_row <= cur_ind && cur_ind <= stop_row) {
+                col_ind[j] -= start_row;
+            } else // Must find out if we have already set up this point
+            {
+                if (externals[cur_ind] == -1) {
+                    externals[cur_ind] = num_external++;
+
+                    if (num_external <= MAX_EXTERNAL) {
+                        external_index[num_external - 1] = cur_ind;
+                        // Mark index as external by negating it
+                        ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1);
+                    } else {
+                        cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp"
+                             << endl;
+                        abort();
+                    }
+                } else {
+                    // Mark index as external by adding 1 and negating it
+                    ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1);
+                }
+            }
+        }
+    }
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Go through list of externals to find out which processors must be accessed.
+    ////////////////////////////////////////////////////////////////////////////
+
+    A->num_external = num_external;
+    int* tmp_buffer = new int[size]; // Temp buffer space needed below
+
+    // Build list of global index offset
+
+    int* global_index_offsets = new int[size];
+    for (i = 0; i < size; i++)
+        tmp_buffer[i] = 0; // First zero out
+
+    tmp_buffer[rank] = start_row; // This is my start row
+
+    // This call sends the start_row of each ith processor to the ith
+    // entry of global_index_offset on all processors.
+    // Thus, each processor know the range of indices owned by all
+    // other processors.
+    // Note:  There might be a better algorithm for doing this, but this
+    //        will work...
+
+    MPI_Allreduce(tmp_buffer,
+        global_index_offsets,
+        size,
+        MPI_INT,
+        MPI_SUM,
+        MPI_COMM_WORLD);
+
+    // Go through list of externals and find the processor that owns each
+    int* external_processor     = new int[num_external];
+    int* new_external_processor = new int[num_external];
+
+    for (i = 0; i < num_external; i++) {
+        int cur_ind = external_index[i];
+        for (int j = size - 1; j >= 0; j--)
+            if (global_index_offsets[j] <= cur_ind) {
+                external_processor[i] = j;
+                break;
+            }
+    }
+    if (debug) {
+        t0 = mytimer() - t0;
+        cout << "          Time in finding processors phase = " << t0 << endl;
+    }
+
+    ////////////////////////////////////////////////////////////////////////////
+    // Sift through the external elements. For each newly encountered external
+    // point assign it the next index in the sequence. Then look for other
+    // external elements who are update by the same node and assign them the next
+    // set of index numbers in the sequence (ie. elements updated by the same node
+    // have consecutive indices).
+    ////////////////////////////////////////////////////////////////////////////
+
+    if (debug) t0 = mytimer();
+
+    int count = local_nrow;
+    for (i = 0; i < num_external; i++)
+        external_local_index[i] = -1;
+
+    for (i = 0; i < num_external; i++) {
+        if (external_local_index[i] == -1) {
+            external_local_index[i] = count++;
+
+            for (j = i + 1; j < num_external; j++) {
+                if (external_processor[j] == external_processor[i])
+                    external_local_index[j] = count++;
+            }
+        }
+    }
+
+    if (debug) {
+        t0 = mytimer() - t0;
+        cout << "           Time in scanning external indices phase = " << t0 << endl;
+    }
+    if (debug) t0 = mytimer();
+
+    for (i = 0; i < local_nrow; i++) {
+        for (j = 0; j < nnz_in_row[i]; j++) {
+            if (ptr_to_inds_in_row[i][j] < 0) // Change index values of externals
+            {
+                int cur_ind              = -ptr_to_inds_in_row[i][j] - 1;
+                ptr_to_inds_in_row[i][j] = external_local_index[externals[cur_ind]];
+            }
+        }
+    }
+
+    for (i = 0; i < num_external; i++)
+        new_external_processor[i] = 0;
+
+    for (i = 0; i < num_external; i++)
+        new_external_processor[external_local_index[i] - local_nrow] =
+            external_processor[i];
+
+    if (debug) {
+        t0 = mytimer() - t0;
+        cout << "           Time in assigning external indices phase = " << t0 << endl;
+    }
+
+    if (debug_details) {
+        for (i = 0; i < num_external; i++) {
+            cout << "Processor " << rank << " of " << size << ": external processor[" << i
+                 << "] = " << external_processor[i] << endl;
+            cout << "Processor " << rank << " of " << size << ": new external processor["
+                 << i << "] = " << new_external_processor[i] << endl;
+        }
+    }
+
+    ////////////////////////////////////////////////////////////////////////////
+    ///
+    // Count the number of neighbors from which we receive information to update
+    // our external elements. Additionally, fill the array tmp_neighbors in the
+    // following way:
+    //      tmp_neighbors[i] = 0   ==>  No external elements are updated by
+    //                              processor i.
+    //      tmp_neighbors[i] = x   ==>  (x-1)/size elements are updated from
+    //                              processor i.
+    ///
+    ////////////////////////////////////////////////////////////////////////////
+
+    t0                 = mytimer();
+    int* tmp_neighbors = new int[size];
+    for (i = 0; i < size; i++)
+        tmp_neighbors[i] = 0;
+
+    int num_recv_neighbors = 0;
+    int length             = 1;
+
+    for (i = 0; i < num_external; i++) {
+        if (tmp_neighbors[new_external_processor[i]] == 0) {
+            num_recv_neighbors++;
+            tmp_neighbors[new_external_processor[i]] = 1;
+        }
+        tmp_neighbors[new_external_processor[i]] += size;
+    }
+
+    /// sum over all processors all the tmp_neighbors arrays ///
+
+    MPI_Allreduce(tmp_neighbors, tmp_buffer, size, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
+
+    /// decode the combined 'tmp_neighbors' (stored in tmp_buffer)
+    //  array from all the processors
+
+    int num_send_neighbors = tmp_buffer[rank] % size;
+
+    /// decode 'tmp_buffer[rank] to deduce total number of elements
+    //  we must send
+
+    int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;
+
+    //
+    // Check to see if we have enough workspace allocated.  This could be
+    // dynamically modified, but let's keep it simple for now...
+    //
+
+    if (num_send_neighbors > MAX_NUM_MESSAGES) {
+        cerr << "Must increase MAX_NUM_MESSAGES in HPC_Sparse_Matrix.hpp" << endl;
+        cerr << "Must be at least " << num_send_neighbors << endl;
+        abort();
+    }
+
+    if (total_to_be_sent > MAX_EXTERNAL) {
+        cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl;
+        cerr << "Must be at least " << total_to_be_sent << endl;
+        abort();
+    }
+    delete[] tmp_neighbors;
+
+    if (debug) {
+        t0 = mytimer() - t0;
+        cout << "           Time in finding neighbors phase = " << t0 << endl;
+    }
+    if (debug)
+        cout << "Processor " << rank << " of " << size
+             << ": Number of send neighbors = " << num_send_neighbors << endl;
+
+    if (debug)
+        cout << "Processor " << rank << " of " << size
+             << ": Number of receive neighbors = " << num_recv_neighbors << endl;
+
+    if (debug)
+        cout << "Processor " << rank << " of " << size
+             << ": Total number of elements to send = " << total_to_be_sent << endl;
+
+    if (debug) MPI_Barrier(MPI_COMM_WORLD);
+
+    /////////////////////////////////////////////////////////////////////////
+    ///
+    // Make a list of the neighbors that will send information to update our
+    // external elements (in the order that we will receive this information).
+    ///
+    /////////////////////////////////////////////////////////////////////////
+
+    int* recv_list = new int[MAX_EXTERNAL];
+
+    j              = 0;
+    recv_list[j++] = new_external_processor[0];
+    for (i = 1; i < num_external; i++) {
+        if (new_external_processor[i - 1] != new_external_processor[i]) {
+            recv_list[j++] = new_external_processor[i];
+        }
+    }
+
+    //
+    // Send a 0 length message to each of our recv neighbors
+    //
+
+    int* send_list = new int[num_send_neighbors];
+    for (i = 0; i < num_send_neighbors; i++)
+        send_list[i] = 0;
+
+    //
+    //  first post receives, these are immediate receives
+    //  Do not wait for result to come, will do that at the
+    //  wait call below.
+    //
+    int MPI_MY_TAG = 99;
+
+    MPI_Request* request = new MPI_Request[MAX_NUM_MESSAGES];
+    for (i = 0; i < num_send_neighbors; i++) {
+        MPI_Irecv(tmp_buffer + i,
+            1,
+            MPI_INT,
+            MPI_ANY_SOURCE,
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+    }
+
+    // send messages
+
+    for (i = 0; i < num_recv_neighbors; i++)
+        MPI_Send(tmp_buffer + i, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
+    ///
+    // Receive message from each send neighbor to construct 'send_list'.
+    ///
+
+    MPI_Status status;
+    for (i = 0; i < num_send_neighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            cerr << "MPI_Wait error\n" << endl;
+            exit(-1);
+        }
+        send_list[i] = status.MPI_SOURCE;
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    ///
+    //  Compare the two lists. In most cases they should be the same.
+    //  However, if they are not then add new entries to the recv list
+    //  that are in the send list (but not already in the recv list).
+    ///
+    /////////////////////////////////////////////////////////////////////////
+
+    for (j = 0; j < num_send_neighbors; j++) {
+        int found = 0;
+        for (i = 0; i < num_recv_neighbors; i++) {
+            if (recv_list[i] == send_list[j]) found = 1;
+        }
+
+        if (found == 0) {
+            if (debug)
+                cout << "Processor " << rank << " of " << size << ": recv_list["
+                     << num_recv_neighbors << "] = " << send_list[j] << endl;
+            recv_list[num_recv_neighbors] = send_list[j];
+            (num_recv_neighbors)++;
+        }
+    }
+
+    delete[] send_list;
+    num_send_neighbors = num_recv_neighbors;
+
+    if (num_send_neighbors > MAX_NUM_MESSAGES) {
+        cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl;
+        abort();
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    /// Start filling HPC_Sparse_Matrix struct
+    /////////////////////////////////////////////////////////////////////////
+
+    A->total_to_be_sent   = total_to_be_sent;
+    int* elements_to_send = new int[total_to_be_sent];
+    A->elements_to_send   = elements_to_send;
+
+    for (i = 0; i < total_to_be_sent; i++)
+        elements_to_send[i] = 0;
+
+    //
+    // Create 'new_external' which explicitly put the external elements in the
+    // order given by 'external_local_index'
+    //
+
+    int* new_external = new int[num_external];
+    for (i = 0; i < num_external; i++) {
+        new_external[external_local_index[i] - local_nrow] = external_index[i];
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    //
+    // Send each processor the global index list of the external elements in the
+    // order that I will want to receive them when updating my external elements
+    //
+    /////////////////////////////////////////////////////////////////////////
+
+    int* lengths = new int[num_recv_neighbors];
+
+    MPI_MY_TAG++;
+
+    // First post receives
+
+    for (i = 0; i < num_recv_neighbors; i++) {
+        int partner = recv_list[i];
+        MPI_Irecv(lengths + i,
+            1,
+            MPI_INT,
+            partner,
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+    }
+
+    int* neighbors   = new int[MAX_NUM_NEIGHBOURS];
+    int* recv_length = new int[MAX_NUM_NEIGHBOURS];
+    int* send_length = new int[MAX_NUM_NEIGHBOURS];
+
+    A->neighbors   = neighbors;
+    A->recv_length = recv_length;
+    A->send_length = send_length;
+
+    j = 0;
+    for (i = 0; i < num_recv_neighbors; i++) {
+        int start     = j;
+        int newlength = 0;
+
+        // go through list of external elements until updating
+        // processor changes
+
+        while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
+            newlength++;
+            j++;
+            if (j == num_external) break;
+        }
+
+        recv_length[i] = newlength;
+        neighbors[i]   = recv_list[i];
+
+        length = j - start;
+        MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
+    }
+
+    // Complete the receives of the number of externals
+
+    for (i = 0; i < num_recv_neighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            cerr << "MPI_Wait error\n" << endl;
+            exit(-1);
+        }
+        send_length[i] = lengths[i];
+    }
+    delete[] lengths;
+
+    ///////////////////////////////////////////////////////////////////
+    // Build "elements_to_send" list.  These are the x elements I own
+    // that need to be sent to other processors.
+    ///////////////////////////////////////////////////////////////////
+
+    MPI_MY_TAG++;
+
+    j = 0;
+    for (i = 0; i < num_recv_neighbors; i++) {
+        MPI_Irecv(elements_to_send + j,
+            send_length[i],
+            MPI_INT,
+            neighbors[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+        j += send_length[i];
+    }
+
+    j = 0;
+    for (i = 0; i < num_recv_neighbors; i++) {
+        int start     = j;
+        int newlength = 0;
+
+        // Go through list of external elements
+        // until updating processor changes.  This is redundant, but
+        // saves us from recording this information.
+
+        while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
+
+            newlength++;
+            j++;
+            if (j == num_external) break;
+        }
+        MPI_Send(new_external + start,
+            j - start,
+            MPI_INT,
+            recv_list[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD);
+    }
+
+    // receive from each neighbor the global index list of external elements
+
+    for (i = 0; i < num_recv_neighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            cerr << "MPI_Wait error\n" << endl;
+            exit(-1);
+        }
+    }
+
+    /// replace global indices by local indices ///
+
+    for (i = 0; i < total_to_be_sent; i++)
+        elements_to_send[i] -= start_row;
+
+    ////////////////
+    // Finish up !!
+    ////////////////
+
+    A->num_send_neighbors = num_send_neighbors;
+    A->local_ncol         = A->local_nrow + num_external;
+
+    // Used in exchange_externals
+    double* send_buffer = new double[total_to_be_sent];
+    A->send_buffer      = send_buffer;
+
+    delete[] tmp_buffer;
+    delete[] global_index_offsets;
+    delete[] recv_list;
+    delete[] external_processor;
+    delete[] new_external;
+    delete[] new_external_processor;
+    delete[] request;
+
+    return;
+}
+
 void commPrintConfig(Comm* c)
 {
 #ifdef _MPI
@@ -70,57 +601,6 @@ void commInit(Comm* c, int argc, char** argv)
 #endif
 }

-void commTestInit(Comm* c, double* p, double* f, double* g)
-{
-    int imax = c->imaxLocal;
-    int jmax = c->jmaxLocal;
-    int rank = c->rank;
-
-    for (int j = 0; j < jmax + 2; j++) {
-        for (int i = 0; i < imax + 2; i++) {
-            p[j * (imax + 2) + i] = rank;
-            f[j * (imax + 2) + i] = rank;
-            g[j * (imax + 2) + i] = rank;
-        }
-    }
-}
-
-static void testWriteFile(char* filename, double* grid, int imax, int jmax)
-{
-    FILE* fp = fopen(filename, "w");
-
-    if (fp == NULL) {
-        printf("Error!\n");
-        exit(EXIT_FAILURE);
-    }
-
-    for (int j = 0; j < jmax + 2; j++) {
-        for (int i = 0; i < imax + 2; i++) {
-            fprintf(fp, "%.2f ", grid[j * (imax + 2) + i]);
-        }
-        fprintf(fp, "\n");
-    }
-
-    fclose(fp);
-}
-
-void commTestWrite(Comm* c, double* p, double* f, double* g)
-{
-    int imax = c->imaxLocal;
-    int jmax = c->jmaxLocal;
-    int rank = c->rank;
-
-    char filename[30];
-    snprintf(filename, 30, "ptest-%d.dat", rank);
-    testWriteFile(filename, p, imax, jmax);
-
-    snprintf(filename, 30, "ftest-%d.dat", rank);
-    testWriteFile(filename, f, imax, jmax);
-
-    snprintf(filename, 30, "gtest-%d.dat", rank);
-    testWriteFile(filename, g, imax, jmax);
-}
-
 void commFinalize(Comm* c)
 {
 #ifdef _MPI
--- a/src/comm.h
+++ b/src/comm.h
@@ -8,10 +8,8 @@
 #include <mpi.h>
 #endif

-enum direction { LEFT = 0, RIGHT, BOTTOM, TOP, NDIRS };
-enum dimension { IDIM = 0, JDIM, NDIMS };
-enum cdimension { CJDIM = 0, CIDIM };
-enum layer { HALO = 0, BULK };
+#include "matrix.h"
+
 enum op { MAX = 0, SUM };

 typedef struct {
@@ -19,37 +17,27 @@ typedef struct {
    int size;
 #if defined(_MPI)
    MPI_Comm comm;
-    MPI_Datatype bufferTypes[NDIRS];
-    MPI_Aint sdispls[NDIRS];
-    MPI_Aint rdispls[NDIRS];
+
+    int num_external;
+    int num_send_neighbors;
+    int* external_index;
+    int* external_local_index;
+    int total_to_be_sent;
+    int* elements_to_send;
+    int* neighbors;
+    int* recv_length;
+    int* send_length;
+    double* send_buffer;
 #endif
-    int neighbours[NDIRS];
-    int coords[NDIMS], dims[NDIMS];
-    int imaxLocal, jmaxLocal;
 } Comm;

 extern int sizeOfRank(int rank, int size, int N);
 extern void commInit(Comm* c, int argc, char** argv);
-extern void commTestInit(Comm* c, double* p, double* f, double* g);
-extern void commTestWrite(Comm* c, double* p, double* f, double* g);
 extern void commFinalize(Comm* c);
-extern void commPartition(Comm* c, int jmax, int imax);
+extern void commPartition(Comm*, Matrix* m);
 extern void commPrintConfig(Comm*);
 extern void commExchange(Comm*, double*);
-extern void commShift(Comm* c, double* f, double* g);
 extern void commReduction(double* v, int op);
-extern int commIsBoundary(Comm* c, int direction);
-extern void commUpdateDatatypes(Comm*, Comm*, int, int);
-extern void commFreeCommunicator(Comm*);
-extern void commCollectResult(Comm* c,
-    double* ug,
-    double* vg,
-    double* pg,
-    double* u,
-    double* v,
-    double* p,
-    int jmax,
-    int imax);

 static inline int commIsMaster(Comm* c) { return c->rank == 0; }
 #endif // __COMM_H_
--- a/src/main.c
+++ b/src/main.c
@@ -2,61 +2,111 @@
 * All rights reserved.
 * Use of this source code is governed by a MIT-style
 * license that can be found in the LICENSE file.*/
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>

+#include "allocate.h"
 #include "comm.h"
 #include "matrix.h"
-#include "progress.h"
+#include "parameter.h"
 #include "solver.h"
 #include "timing.h"
+#include "util.h"
+
+CG_FLOAT compute_residual(Solver* s)
+{
+    CG_FLOAT residual = 0.0;
+    int n             = s->A.nr;
+    CG_FLOAT* v1      = s->x;
+    CG_FLOAT* v2      = s->xexact;
+
+    for (int i = 0; i < n; i++) {
+        double diff = fabs(v1[i] - v2[i]);
+        if (diff > residual) residual = diff;
+    }
+
+    commReduction(&residual, MAX);
+
+    return residual;
+}

 int main(int argc, char** argv)
 {
    int rank;
    double timeStart, timeStop;
-    Matrix m;
+    Parameter param;
    Solver s;
+    Comm comm;

-    // commInit(s.comm, argc, argv);
-
-    // FILE* fp;
-    // if (commIsMaster(s.comm)) fp = initResidualWriter();
+    commInit(&comm, argc, argv);
+    initParameter(&param);

    if (argc != 2) {
        printf("Usage: %s <configFile>\n", argv[0]);
        exit(EXIT_SUCCESS);
    }

-    matrixRead(&m, argv[1]);
-    matrixDump(&m);
-    exit(EXIT_SUCCESS);
+    readParameter(&param, argv[1]);

-    double te  = 1000;
-    double t   = 0.0;
-    double res = 0.0;
+    CG_FLOAT eps = (CG_FLOAT)param.eps;
+    int itermax  = param.itermax;
+    initSolver(&s, &param);
+    // matrixDump(&s.A);

+    CG_UINT nrow    = s.A.nr;
+    CG_UINT ncol    = s.A.nc;
+    CG_FLOAT* r     = (CG_FLOAT*)allocate(64, nrow * sizeof(CG_FLOAT));
+    CG_FLOAT* p     = (CG_FLOAT*)allocate(64, ncol * sizeof(CG_FLOAT));
+    CG_FLOAT* Ap    = (CG_FLOAT*)allocate(64, nrow * sizeof(CG_FLOAT));
+    CG_FLOAT normr  = 0.0;
+    CG_FLOAT rtrans = 0.0, oldrtrans;
+
+    waxpby(nrow, 1.0, s.x, 0.0, s.x, p);
+    spMVM(&s.A, p, Ap);
+    waxpby(nrow, 1.0, s.b, -1.0, Ap, r);
+    ddot(nrow, r, r, &rtrans);
+
+    normr = sqrt(rtrans);
+
+    // initial iteration
+    waxpby(nrow, 1.0, r, 0.0, r, p);
+
+    // TICK(); exchange_externals(A,p); TOCK(t5);
+    spMVM(&s.A, p, Ap);
+    double alpha = 0.0;
+    ddot(nrow, p, Ap, &alpha);
+    alpha = rtrans / alpha;
+    waxpby(nrow, 1.0, s.x, alpha, p, s.x);
+    waxpby(nrow, 1.0, r, -alpha, Ap, r);
+
+    int k;
    timeStart = getTimeStamp();
-    while (t <= te) {
-
-        // if (commIsMaster(s.comm)) writeResidual(fp, t, res);
-
-        t += 0.5;
-
-#ifdef VERBOSE
-        if (commIsMaster(s.comm)) {
-            printf("TIME %f , TIMESTEP %f\n", t, d.dt);
-        }
-#else
-        printProgress(t);
-#endif
+    for (k = 1; k < itermax && normr > eps; k++) {
+        oldrtrans = rtrans;
+        ddot(nrow, r, r, &rtrans);
+        double beta = rtrans / oldrtrans;
+        waxpby(nrow, 1.0, r, beta, p, p);
+        // TICK(); exchange_externals(A,p); TOCK(t5);
+        spMVM(&s.A, p, Ap);
+        alpha = 0.0;
+        ddot(nrow, p, Ap, &alpha);
+        alpha = rtrans / alpha;
+        waxpby(nrow, 1.0, s.x, alpha, p, s.x);
+        waxpby(nrow, 1.0, r, -alpha, Ap, r);
    }
    timeStop = getTimeStamp();
-    if (commIsMaster(s.comm)) {
-        printf("Solution took %.2fs\n", timeStop - timeStart);
+
+    double residual = compute_residual(&s);
+
+    if (commIsMaster(&comm)) {
+        printf("Solution performed %d iterations and took %.2fs\n",
+            k,
+            timeStop - timeStart);
+        printf("Difference between computed and exact  = %f\n", residual);
    }
-    // if (commIsMaster(s.comm)) fclose(fp);
-    commFinalize(s.comm);
+
+    commFinalize(&comm);
    return EXIT_SUCCESS;
 }
--- a/src/matrix.c
+++ b/src/matrix.c
@@ -94,7 +94,7 @@ void matrixRead(Matrix* m, char* filename)
    Entry* mm;

    if (sym_flag) {
-        mm = (Entry*)allocate(64, nz * 2 * sizeof(Entry));
+        mm = (Entry*)allocate(ARRAY_ALIGNMENT, nz * 2 * sizeof(Entry));
    } else {
        mm = (Entry*)allocate(64, nz * sizeof(Entry));
    }
--- a/src/matrix.h
+++ b/src/matrix.h
@@ -4,6 +4,7 @@
 * license that can be found in the LICENSE file.*/
 #ifndef __MATRIX_H_
 #define __MATRIX_H_
+#include <stdbool.h>

 #include "util.h"

@@ -14,7 +15,9 @@ typedef struct {
 } Entry;

 typedef struct {
-    CG_UINT nr, nnz;          // number of rows and non zeros
+    CG_UINT nr, nc, nnz;       // number of rows, columns and non zeros
+    CG_UINT totalNr, totalNnz; // number of rows and non zeros
+    CG_UINT startRow, stopRow;
    CG_UINT *colInd, *rowPtr; // colum Indices, row Pointer
    CG_FLOAT* val;
 } Matrix;
--- a/src/parameter.c
+++ b/src/parameter.c
@@ -1,9 +1,7 @@
-/*
- * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+/* Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
 * All rights reserved. This file is part of nusif-solver.
 * Use of this source code is governed by a MIT style
- * license that can be found in the LICENSE file.
- */
+ * license that can be found in the LICENSE file. */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -14,16 +12,12 @@

 void initParameter(Parameter* param)
 {
-    param->xlength    = 1.0;
-    param->ylength    = 1.0;
-    param->imax       = 100;
-    param->jmax       = 100;
-    param->itermax    = 1000;
-    param->eps        = 0.0001;
-    param->omg        = 1.8;
-    param->levels     = 5;
-    param->presmooth  = 5;
-    param->postsmooth = 5;
+    param->filename = "generate";
+    param->nx       = 100;
+    param->ny       = 100;
+    param->nz       = 1000;
+    param->itermax  = 1000;
+    param->eps      = 0.0001;
 }

 void readParameter(Parameter* param, const char* filename)
@@ -56,31 +50,11 @@ void readParameter(Parameter* param, const char* filename)
 #define PARSE_REAL(p)   PARSE_PARAM(p, atof)

        if (tok != NULL && val != NULL) {
-            PARSE_REAL(xlength);
-            PARSE_REAL(ylength);
-            PARSE_INT(imax);
-            PARSE_INT(jmax);
+            PARSE_INT(nx);
+            PARSE_INT(ny);
+            PARSE_INT(nz);
            PARSE_INT(itermax);
            PARSE_REAL(eps);
-            PARSE_REAL(omg);
-            PARSE_REAL(re);
-            PARSE_REAL(tau);
-            PARSE_REAL(gamma);
-            PARSE_REAL(dt);
-            PARSE_REAL(te);
-            PARSE_REAL(gx);
-            PARSE_REAL(gy);
-            PARSE_STRING(name);
-            PARSE_INT(bcLeft);
-            PARSE_INT(bcRight);
-            PARSE_INT(bcBottom);
-            PARSE_INT(bcTop);
-            PARSE_INT(levels);
-            PARSE_INT(presmooth);
-            PARSE_INT(postsmooth);
-            PARSE_REAL(u_init);
-            PARSE_REAL(v_init);
-            PARSE_REAL(p_init);
        }
    }

@@ -89,26 +63,10 @@ void readParameter(Parameter* param, const char* filename)

 void printParameter(Parameter* param)
 {
-    printf("Parameters for %s\n", param->name);
-    printf("Boundary conditions Left:%d Right:%d Bottom:%d Top:%d\n",
-        param->bcLeft,
-        param->bcRight,
-        param->bcBottom,
-        param->bcTop);
-    printf("\tReynolds number: %.2f\n", param->re);
-    printf("\tInit arrays: U:%.2f V:%.2f P:%.2f\n",
-        param->u_init,
-        param->v_init,
-        param->p_init);
+    printf("Parameters\n");
    printf("Geometry data:\n");
-    printf("\tDomain box size (x, y): %.2f, %.2f\n", param->xlength, param->ylength);
-    printf("\tCells (x, y): %d, %d\n", param->imax, param->jmax);
-    printf("Timestep parameters:\n");
-    printf("\tDefault stepsize: %.2f, Final time %.2f\n", param->dt, param->te);
-    printf("\tTau factor: %.2f\n", param->tau);
+    printf("\tPoints (x, y, z): %d, %d, %d\n", param->nx, param->ny, param->nz);
    printf("Iterative solver parameters:\n");
    printf("\tMax iterations: %d\n", param->itermax);
    printf("\tepsilon (stopping tolerance) : %f\n", param->eps);
-    printf("\tgamma (stopping tolerance) : %f\n", param->gamma);
-    printf("\tomega (SOR relaxation): %f\n", param->omg);
 }
--- a/src/parameter.h
+++ b/src/parameter.h
@@ -1,24 +1,15 @@
-/*
- * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+/* Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
 * All rights reserved. This file is part of nusif-solver.
 * Use of this source code is governed by a MIT style
- * license that can be found in the LICENSE file.
- */
+ * license that can be found in the LICENSE file. */
 #ifndef __PARAMETER_H_
 #define __PARAMETER_H_

 typedef struct {
-    double xlength, ylength;
-    int imax, jmax;
+    char* filename;
+    int nx, ny, nz;
    int itermax;
-    double eps, omg;
-    double re, tau, gamma;
-    double te, dt;
-    double gx, gy;
-    char* name;
-    int bcLeft, bcRight, bcBottom, bcTop;
-    double u_init, v_init, p_init;
-    int levels, presmooth, postsmooth;
+    double eps;
 } Parameter;

 void initParameter(Parameter*);
--- a/src/progress.c
+++ b/src/progress.c
@@ -1,15 +1,12 @@
-/*
- * Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
+/* Copyright (C)  NHR@FAU, University Erlangen-Nuremberg.
 * All rights reserved. This file is part of nusif-solver.
 * Use of this source code is governed by a MIT style
- * license that can be found in the LICENSE file.
- */
-#include <math.h>
-#include <mpi.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
+ * license that can be found in the LICENSE file. */
 #include "progress.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>

 static double _end;
 static int _current;
--- a/src/solver.c
+++ b/src/solver.c
@@ -2,13 +2,147 @@
 * All rights reserved.
 * Use of this source code is governed by a MIT-style
 * license that can be found in the LICENSE file.*/
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
 #include <stdlib.h>
+#include <string.h>

+#include "allocate.h"
+#include "comm.h"
 #include "matrix.h"
 #include "solver.h"
 #include "util.h"

-void spMVM(Matrix* m, double* x, double* y)
+static void matrixGenerate(Parameter* p, Solver* s, bool use_7pt_stencil)
+{
+    int size = 1; // Serial case (not using MPI)
+    int rank = 0;
+
+    CG_UINT local_nrow = p->nx * p->ny * p->nz;
+    CG_UINT local_nnz  = 27 * local_nrow;
+
+    CG_UINT total_nrow = local_nrow * size;
+    CG_UINT total_nnz  = 27 * total_nrow;
+
+    int start_row = local_nrow * rank;
+    int stop_row  = start_row + local_nrow - 1;
+
+    if (use_7pt_stencil) {
+        printf("Generate 7pt matrix with ");
+    } else {
+        printf("Generate 27pt matrix with ");
+    }
+    printf("%d total rows and %d nonzeros\n", (int)total_nrow, (int)local_nnz);
+
+    s->A.val    = (CG_FLOAT*)allocate(64, local_nnz * sizeof(CG_FLOAT));
+    s->A.colInd = (CG_UINT*)allocate(64, local_nnz * sizeof(CG_UINT));
+    s->A.rowPtr = (CG_UINT*)allocate(64, (local_nrow + 1) * sizeof(CG_UINT));
+    s->x        = (CG_FLOAT*)allocate(64, local_nrow * sizeof(CG_FLOAT));
+    s->b        = (CG_FLOAT*)allocate(64, local_nrow * sizeof(CG_FLOAT));
+    s->xexact   = (CG_FLOAT*)allocate(64, local_nrow * sizeof(CG_FLOAT));
+
+    CG_FLOAT* curvalptr = s->A.val;
+    CG_UINT* curindptr  = s->A.colInd;
+    CG_UINT* currowptr  = s->A.rowPtr;
+    CG_FLOAT* x         = s->x;
+    CG_FLOAT* b         = s->b;
+    CG_FLOAT* xexact    = s->xexact;
+
+    CG_UINT nnzglobal = 0;
+    int nx = p->nx, ny = p->ny, nz = p->nz;
+    CG_UINT cursor = 0;
+
+    // for (int i = 0; i < local_nnz; i++) {
+    //     curvalptr[i] = 0.0;
+    //     printf("%d-%f, ", i, m->val[i]);
+    // }
+    // printf("\n");
+    *currowptr++ = 0;
+
+    for (int iz = 0; iz < nz; iz++) {
+        for (int iy = 0; iy < ny; iy++) {
+            for (int ix = 0; ix < nx; ix++) {
+
+                int curlocalrow = iz * nx * ny + iy * nx + ix;
+                int currow      = start_row + iz * nx * ny + iy * nx + ix;
+                int nnzrow      = 0;
+
+                // (*A)->ptr_to_vals_in_row[curlocalrow] = curvalptr;
+                // (*A)->ptr_to_inds_in_row[curlocalrow] = curindptr;
+
+                for (int sz = -1; sz <= 1; sz++) {
+                    for (int sy = -1; sy <= 1; sy++) {
+                        for (int sx = -1; sx <= 1; sx++) {
+
+                            int curcol = currow + sz * nx * ny + sy * nx + sx;
+                            //            Since we have a stack of nx by ny by nz domains
+                            //            , stacking in the z direction, we check to see
+                            //            if sx and sy are reaching outside of the domain,
+                            //            while the check for the curcol being valid is
+                            //            sufficient to check the z values
+                            if ((ix + sx >= 0) && (ix + sx < nx) && (iy + sy >= 0) &&
+                                (iy + sy < ny) && (curcol >= 0 && curcol < total_nrow)) {
+                                // This logic will skip over point that are not part of a
+                                // 7-pt stencil
+                                if (!use_7pt_stencil ||
+                                    (sz * sz + sy * sy + sx * sx <= 1)) {
+                                    if (curcol == currow) {
+                                        *curvalptr++ = 27.0;
+                                    } else {
+                                        *curvalptr++ = -1.0;
+                                    }
+                                    *curindptr++ = curcol;
+                                    nnzrow++;
+                                }
+                            }
+                        } // end sx loop
+                    } // end sy loop
+                } // end sz loop
+
+                *currowptr = *(currowptr - 1) + nnzrow;
+                // printf("%d:%d-%lld, ", currow, nnzrow, *currowptr);
+                currowptr++;
+                nnzglobal += nnzrow;
+                x[curlocalrow]      = 0.0;
+                b[curlocalrow]      = 27.0 - ((CG_FLOAT)(nnzrow - 1));
+                xexact[curlocalrow] = 1.0;
+            } // end ix loop
+        } // end iy loop
+    } // end iz loop
+
+#ifdef VERBOSE
+    printf("Process %d of %d has %d rows\n", rank, size, local_nrow);
+    printf("Global rows %d through %d\n", start_row, stop_row);
+    printf("%d nonzeros\n", start_row, stop_row);
+#endif /* ifdef VERBOSE */
+
+    // for (int i = 0; i < local_nnz; i++) {
+    //     printf("%d:%f, ", (int)m->colInd[i], m->val[i]);
+    // }
+    // printf("\n");
+
+    s->A.startRow = start_row;
+    s->A.stopRow  = stop_row;
+    s->A.totalNr  = total_nrow;
+    s->A.totalNnz = total_nnz;
+    s->A.nr       = local_nrow;
+    s->A.nc       = local_nrow;
+    s->A.nnz      = local_nnz;
+}
+
+void initSolver(Solver* s, Parameter* p)
+{
+    if (!strcmp(p->filename, "generate")) {
+        matrixGenerate(p, s, false);
+    } else if (!strcmp(p->filename, "generate7P")) {
+        matrixGenerate(p, s, true);
+    } else {
+        matrixRead(&s->A, p->filename);
+    }
+}
+
+void spMVM(Matrix* m, const CG_FLOAT* restrict x, CG_FLOAT* restrict y)
 {
    CG_UINT numRows = m->nr;
    CG_UINT* rowPtr = m->rowPtr;
@@ -26,3 +160,46 @@ void spMVM(Matrix* m, double* x, double* y)
        y[rowID] = tmp;
    }
 }
+
+void waxpby(const CG_UINT n,
+    const CG_FLOAT alpha,
+    const CG_FLOAT* restrict x,
+    const CG_FLOAT beta,
+    const CG_FLOAT* restrict y,
+    CG_FLOAT* const w)
+{
+    if (alpha == 1.0) {
+        for (size_t i = 0; i < n; i++) {
+            w[i] = x[i] + beta * y[i];
+        }
+    } else if (beta == 1.0) {
+        for (size_t i = 0; i < n; i++) {
+            w[i] = alpha * x[i] + y[i];
+        }
+    } else {
+        for (size_t i = 0; i < n; i++) {
+            w[i] = alpha * x[i] + beta * y[i];
+        }
+    }
+}
+
+void ddot(const CG_UINT n,
+    const CG_FLOAT* restrict x,
+    const CG_FLOAT* restrict y,
+    CG_FLOAT* restrict result)
+{
+    CG_FLOAT sum = 0.0;
+
+    if (y == x) {
+        for (size_t i = 0; i < n; i++) {
+            sum += x[i] * x[i];
+        }
+    } else {
+        for (size_t i = 0; i < n; i++) {
+            sum += x[i] * y[i];
+        }
+    }
+
+    commReduction(&sum, SUM);
+    *result = sum;
+}
--- a/src/solver.h
+++ b/src/solver.h
@@ -4,21 +4,29 @@
 * license that can be found in the LICENSE file. */
 #ifndef __SOLVER_H_
 #define __SOLVER_H_
-#include "comm.h"
 #include "matrix.h"
 #include "parameter.h"
 #include "util.h"

 typedef struct {
-    /* geometry and grid information */
-    /* parameters */
-    double eps, omega;
-    int itermax;
-
-    /* communication */
-    Comm* comm;
+    Matrix A;
+    CG_FLOAT* x;
+    CG_FLOAT* b;
+    CG_FLOAT* xexact;
 } Solver;

-void initSolver(Solver*, Parameter*);
-void spMVM(Matrix* m, CG_FLOAT* x, CG_FLOAT* y);
+void initSolver(Solver* s, Parameter*);
+void spMVM(Matrix* m, const CG_FLOAT* restrict x, CG_FLOAT* restrict y);
+
+void waxpby(const CG_UINT n,
+    const CG_FLOAT alpha,
+    const CG_FLOAT* restrict x,
+    const CG_FLOAT beta,
+    const CG_FLOAT* restrict y,
+    double* restrict w);
+
+void ddot(const CG_UINT n,
+    const CG_FLOAT* restrict e,
+    const CG_FLOAT* restrict y,
+    CG_FLOAT* restrict result);
 #endif // __SOLVER_H_