Cleanup and add MPI-IO starting point

2023-02-05 08:02:01 +01:00
parent 2f9270e2aa
commit f4e518ed76
44 changed files with 2614 additions and 1 deletions
--- a/BasicSolver/3D-mpi-io/Makefile
+++ b/BasicSolver/3D-mpi-io/Makefile
@@ -0,0 +1,71 @@
+#=======================================================================================
+# Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+# All rights reserved.
+# Use of this source code is governed by a MIT-style
+# license that can be found in the LICENSE file.
+#=======================================================================================
+
+#CONFIGURE BUILD SYSTEM
+TARGET	   = exe-$(TAG)
+BUILD_DIR  = ./$(TAG)
+SRC_DIR    = ./src
+MAKE_DIR   = ./
+Q         ?= @
+
+#DO NOT EDIT BELOW
+include $(MAKE_DIR)/config.mk
+include $(MAKE_DIR)/include_$(TAG).mk
+INCLUDES  += -I$(SRC_DIR) -I$(BUILD_DIR)
+
+VPATH     = $(SRC_DIR)
+SRC       = $(wildcard $(SRC_DIR)/*.c)
+ASM       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.s, $(SRC))
+OBJ       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o, $(SRC))
+SOURCES   = $(SRC) $(wildcard $(SRC_DIR)/*.h)
+CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(OPTIONS) $(INCLUDES)
+
+${TARGET}: $(BUILD_DIR) $(OBJ)
+	$(info ===>  LINKING  $(TARGET))
+	$(Q)${LINKER} ${LFLAGS} -o $(TARGET) $(OBJ) $(LIBS)
+
+$(BUILD_DIR)/%.o:  %.c $(MAKE_DIR)/include_$(TAG).mk $(MAKE_DIR)/config.mk
+	$(info ===>  COMPILE  $@)
+	$(CC) -c $(CPPFLAGS) $(CFLAGS) $< -o $@
+	$(Q)$(GCC) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
+
+$(BUILD_DIR)/%.s:  %.c
+	$(info ===>  GENERATE ASM  $@)
+	$(CC) -S $(CPPFLAGS) $(CFLAGS) $< -o $@
+
+.PHONY: clean distclean tags info asm format
+
+clean:
+	$(info ===>  CLEAN)
+	@rm -rf $(BUILD_DIR)
+	@rm -f tags
+
+distclean: clean
+	$(info ===>  DIST CLEAN)
+	@rm -f $(TARGET)
+
+info:
+	$(info $(CFLAGS))
+	$(Q)$(CC) $(VERSION)
+
+asm:  $(BUILD_DIR) $(ASM)
+
+tags:
+	$(info ===>  GENERATE TAGS)
+	$(Q)ctags -R
+
+format:
+	@for src in $(SOURCES) ; do \
+		echo "Formatting $$src" ; \
+		clang-format -i $$src ; \
+	done
+	@echo "Done"
+
+$(BUILD_DIR):
+	@mkdir $(BUILD_DIR)
+
+-include $(OBJ:.o=.d)
--- a/BasicSolver/3D-mpi-io/README.md
+++ b/BasicSolver/3D-mpi-io/README.md
@@ -0,0 +1,78 @@
+# C source skeleton
+
+## Build
+
+1. Configure the toolchain and additional options in `config.mk`:
+```
+# Supported: GCC, CLANG, ICC
+TAG ?= GCC
+ENABLE_OPENMP ?= false
+
+OPTIONS +=  -DARRAY_ALIGNMENT=64
+#OPTIONS +=  -DVERBOSE
+#OPTIONS +=  -DVERBOSE_AFFINITY
+#OPTIONS +=  -DVERBOSE_DATASIZE
+#OPTIONS +=  -DVERBOSE_TIMER
+```
+
+The verbosity options enable detailed output about solver, affinity settings, allocation sizes and timer resolution.
+For debugging you may want to set the VERBOSE option:
+```
+# Supported: GCC, CLANG, ICC
+TAG ?= GCC
+ENABLE_OPENMP ?= false
+
+OPTIONS +=  -DARRAY_ALIGNMENT=64
+OPTIONS +=  -DVERBOSE
+#OPTIONS +=  -DVERBOSE_AFFINITY
+#OPTIONS +=  -DVERBOSE_DATASIZE
+#OPTIONS +=  -DVERBOSE_TIMER
+`
+
+2. Build with:
+```
+make
+```
+
+You can build multiple toolchains in the same directory, but notice that the Makefile is only acting on the one currently set.
+Intermediate build results are located in the `<TOOLCHAIN>` directory.
+
+To output the executed commands use:
+```
+make Q=
+```
+
+3. Clean up with:
+```
+make clean
+```
+to clean intermediate build results.
+
+```
+make distclean
+```
+to clean intermediate build results and binary.
+
+4. (Optional) Generate assembler:
+```
+make asm
+```
+The assembler files will also be located in the `<TOOLCHAIN>` directory.
+
+## Usage
+
+You have to provide a parameter file describing the problem you want to solve:
+```
+./exe-CLANG  dcavity.par
+```
+
+Examples are given in in dcavity (a lid driven cavity test case) and canal (simulating a empty canal).
+
+You can plot the resulting velocity and pressure fields using gnuplot:
+```
+gnuplot vector.plot
+```
+and for the pressure:
+```
+gnuplot surface.plot
+```
--- a/BasicSolver/3D-mpi-io/canal.par
+++ b/BasicSolver/3D-mpi-io/canal.par
@@ -0,0 +1,52 @@
+#==============================================================================
+#                            Laminar Canal Flow
+#==============================================================================
+
+# Problem specific Data:
+# ---------------------
+
+name canal             # name of flow setup
+
+bcLeft    3			#  flags for boundary conditions
+bcRight   3			#  1 = no-slip      3 = outflow
+bcBottom  1			#  2 = free-slip    4 = periodic
+bcTop     1			#
+bcFront   1			#
+bcBack    1			#
+
+gx     0.0      # Body forces (e.g. gravity)
+gy     0.0      #
+gz     0.0      #
+
+re            100.0	   # Reynolds number
+
+u_init        1.0      # initial value for velocity in x-direction
+v_init        0.0      # initial value for velocity in y-direction
+w_init        0.0      # initial value for velocity in z-direction
+p_init        0.0      # initial value for pressure
+
+# Geometry Data:
+# -------------
+
+xlength       30.0     # domain size in x-direction
+ylength       4.0	   # domain size in y-direction
+zlength       4.0	   # domain size in z-direction
+imax          200      # number of interior cells in x-direction
+jmax          50	   # number of interior cells in y-direction
+kmax          50	   # number of interior cells in z-direction
+
+# Time Data:
+# ---------
+
+te       100.0   # final time
+dt       0.02    # time stepsize
+tau      0.5     # safety factor for time stepsize control (<0 constant delt)
+
+# Pressure Iteration Data:
+# -----------------------
+
+itermax       500       # maximal number of pressure iteration in one time step
+eps           0.0001   # stopping tolerance for pressure iteration
+omg           1.3       # relaxation parameter for SOR iteration
+gamma         0.9       # upwind differencing factor gamma
+#===============================================================================
--- a/BasicSolver/3D-mpi-io/config.mk
+++ b/BasicSolver/3D-mpi-io/config.mk
@@ -0,0 +1,12 @@
+# Supported: GCC, CLANG, ICC
+TAG ?= ICC
+ENABLE_OPENMP ?= false
+
+#Feature options
+OPTIONS +=  -DARRAY_ALIGNMENT=64
+OPTIONS +=  -DVERBOSE
+#OPTIONS +=  -DDEBUG
+#OPTIONS +=  -DBOUNDCHECK
+#OPTIONS +=  -DVERBOSE_AFFINITY
+#OPTIONS +=  -DVERBOSE_DATASIZE
+#OPTIONS +=  -DVERBOSE_TIMER
--- a/BasicSolver/3D-mpi-io/dcavity.par
+++ b/BasicSolver/3D-mpi-io/dcavity.par
@@ -0,0 +1,52 @@
+#==============================================================================
+#                              Driven Cavity
+#==============================================================================
+
+# Problem specific Data:
+# ---------------------
+
+name dcavity        # name of flow setup
+
+bcLeft    1			#  flags for boundary conditions
+bcRight   1			#  1 = no-slip      3 = outflow
+bcBottom  1			#  2 = free-slip    4 = periodic
+bcTop     1			#
+bcFront   1			#
+bcBack    1			#
+
+gx    0.0			# Body forces (e.g. gravity)
+gy    0.0			#
+gz    0.0			#
+
+re    1000.0		    # Reynolds number
+
+u_init    0.0		# initial value for velocity in x-direction
+v_init    0.0		# initial value for velocity in y-direction
+w_init    0.0		# initial value for velocity in z-direction
+p_init    0.0		# initial value for pressure
+
+# Geometry Data:
+# -------------
+
+xlength    1.0		# domain size in x-direction
+ylength    1.0		# domain size in y-direction
+zlength    1.0		# domain size in z-direction
+imax       128		# number of interior cells in x-direction
+jmax       128		# number of interior cells in y-direction
+kmax       128		# number of interior cells in z-direction
+
+# Time Data:
+# ---------
+
+te       10.0		# final time
+dt       0.02	    # time stepsize
+tau      0.5		# safety factor for time stepsize control (<0 constant delt)
+
+# Pressure Iteration Data:
+# -----------------------
+
+itermax  1000		# maximal number of pressure iteration in one time step
+eps      0.001		# stopping tolerance for pressure iteration
+omg      1.8		# relaxation parameter for SOR iteration
+gamma    0.9		# upwind differencing factor gamma
+#===============================================================================
--- a/BasicSolver/3D-mpi-io/include_CLANG.mk
+++ b/BasicSolver/3D-mpi-io/include_CLANG.mk
@@ -0,0 +1,17 @@
+CC   = mpicc
+GCC  = cc
+LINKER = $(CC)
+
+ifeq ($(ENABLE_OPENMP),true)
+OPENMP   = -fopenmp
+#OPENMP   = -Xpreprocessor -fopenmp #required on Macos with homebrew libomp
+LIBS     = # -lomp
+endif
+
+VERSION  = --version
+# CFLAGS   = -O3 -std=c17 $(OPENMP)
+CFLAGS   = -Ofast -std=c17
+#CFLAGS   = -Ofast -fnt-store=aggressive  -std=c99 $(OPENMP) #AMD CLANG
+LFLAGS   = $(OPENMP) -lm
+DEFINES  = -D_GNU_SOURCE# -DDEBUG
+INCLUDES =
--- a/BasicSolver/3D-mpi-io/include_GCC.mk
+++ b/BasicSolver/3D-mpi-io/include_GCC.mk
@@ -0,0 +1,14 @@
+CC   = gcc
+GCC  = gcc
+LINKER = $(CC)
+
+ifeq ($(ENABLE_OPENMP),true)
+OPENMP   = -fopenmp
+endif
+
+VERSION  = --version
+CFLAGS   = -Ofast -ffreestanding -std=c99 $(OPENMP)
+LFLAGS   = $(OPENMP)
+DEFINES  = -D_GNU_SOURCE
+INCLUDES =
+LIBS     =
--- a/BasicSolver/3D-mpi-io/include_ICC.mk
+++ b/BasicSolver/3D-mpi-io/include_ICC.mk
@@ -0,0 +1,14 @@
+CC   = icc
+GCC  = gcc
+LINKER = $(CC)
+
+ifeq ($(ENABLE_OPENMP),true)
+OPENMP   = -qopenmp
+endif
+
+VERSION  = --version
+CFLAGS   =  -O3 -xHost -qopt-zmm-usage=high -std=c99 $(OPENMP)
+LFLAGS   = $(OPENMP)
+DEFINES  = -D_GNU_SOURCE
+INCLUDES =
+LIBS     =
--- a/BasicSolver/3D-mpi-io/src/allocate.c
+++ b/BasicSolver/3D-mpi-io/src/allocate.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved.
+ * Use of this source code is governed by a MIT-style
+ * license that can be found in the LICENSE file.
+ */
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+void* allocate(int alignment, size_t bytesize)
+{
+    int errorCode;
+    void* ptr;
+
+    errorCode = posix_memalign(&ptr, alignment, bytesize);
+
+    if (errorCode) {
+        if (errorCode == EINVAL) {
+            fprintf(stderr, "Error: Alignment parameter is not a power of two\n");
+            exit(EXIT_FAILURE);
+        }
+        if (errorCode == ENOMEM) {
+            fprintf(stderr, "Error: Insufficient memory to fulfill the request\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    if (ptr == NULL) {
+        fprintf(stderr, "Error: posix_memalign failed!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    return ptr;
+}
--- a/BasicSolver/3D-mpi-io/src/allocate.h
+++ b/BasicSolver/3D-mpi-io/src/allocate.h
@@ -0,0 +1,13 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved.
+ * Use of this source code is governed by a MIT-style
+ * license that can be found in the LICENSE file.
+ */
+#ifndef __ALLOCATE_H_
+#define __ALLOCATE_H_
+#include <stdlib.h>
+
+extern void* allocate(int alignment, size_t bytesize);
+
+#endif
--- a/BasicSolver/3D-mpi-io/src/comm.c
+++ b/BasicSolver/3D-mpi-io/src/comm.c
@@ -0,0 +1,503 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of nusif-solver.
+ * Use of this source code is governed by a MIT style
+ * license that can be found in the LICENSE file.
+ */
+#include <mpi.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "allocate.h"
+#include "comm.h"
+
+// subroutines local to this module
+static int sizeOfRank(int rank, int size, int N)
+{
+    return N / size + ((N % size > rank) ? 1 : 0);
+}
+
+static void setupCommunication(Comm* c, Direction direction, int layer)
+{
+    int imaxLocal = c->imaxLocal;
+    int jmaxLocal = c->jmaxLocal;
+    int kmaxLocal = c->kmaxLocal;
+
+    size_t dblsize = sizeof(double);
+    int sizes[NDIMS];
+    int subSizes[NDIMS];
+    int starts[NDIMS];
+    int offset = 0;
+
+    sizes[IDIM] = imaxLocal + 2;
+    sizes[JDIM] = jmaxLocal + 2;
+    sizes[KDIM] = kmaxLocal + 2;
+
+    if (layer == HALO) {
+        offset = 1;
+    }
+
+    switch (direction) {
+    case LEFT:
+        subSizes[IDIM] = 1;
+        subSizes[JDIM] = jmaxLocal;
+        subSizes[KDIM] = kmaxLocal;
+        starts[IDIM]   = 1 - offset;
+        starts[JDIM]   = 1;
+        starts[KDIM]   = 1;
+        break;
+    case RIGHT:
+        subSizes[IDIM] = 1;
+        subSizes[JDIM] = jmaxLocal;
+        subSizes[KDIM] = kmaxLocal;
+        starts[IDIM]   = imaxLocal + offset;
+        starts[JDIM]   = 1;
+        starts[KDIM]   = 1;
+        break;
+    case BOTTOM:
+        subSizes[IDIM] = imaxLocal;
+        subSizes[JDIM] = 1;
+        subSizes[KDIM] = kmaxLocal;
+        starts[IDIM]   = 1;
+        starts[JDIM]   = 1 - offset;
+        starts[KDIM]   = 1;
+        break;
+    case TOP:
+        subSizes[IDIM] = imaxLocal;
+        subSizes[JDIM] = 1;
+        subSizes[KDIM] = kmaxLocal;
+        starts[IDIM]   = 1;
+        starts[JDIM]   = jmaxLocal + offset;
+        starts[KDIM]   = 1;
+        break;
+    case FRONT:
+        subSizes[IDIM] = imaxLocal;
+        subSizes[JDIM] = jmaxLocal;
+        subSizes[KDIM] = 1;
+        starts[IDIM]   = 1;
+        starts[JDIM]   = 1;
+        starts[KDIM]   = 1 - offset;
+        break;
+    case BACK:
+        subSizes[IDIM] = imaxLocal;
+        subSizes[JDIM] = jmaxLocal;
+        subSizes[KDIM] = 1;
+        starts[IDIM]   = 1;
+        starts[JDIM]   = 1;
+        starts[KDIM]   = kmaxLocal + offset;
+        break;
+    case NDIRS:
+        printf("ERROR!\n");
+        break;
+    }
+
+    if (layer == HALO) {
+        MPI_Type_create_subarray(NDIMS,
+            sizes,
+            subSizes,
+            starts,
+            MPI_ORDER_C,
+            MPI_DOUBLE,
+            &c->rbufferTypes[direction]);
+        MPI_Type_commit(&c->rbufferTypes[direction]);
+    } else if (layer == BULK) {
+        MPI_Type_create_subarray(NDIMS,
+            sizes,
+            subSizes,
+            starts,
+            MPI_ORDER_C,
+            MPI_DOUBLE,
+            &c->sbufferTypes[direction]);
+        MPI_Type_commit(&c->sbufferTypes[direction]);
+    }
+}
+
+static void assembleResult(Comm* c,
+    double* src,
+    double* dst,
+    int imaxLocal[],
+    int jmaxLocal[],
+    int kmaxLocal[],
+    int offset[],
+    int kmax,
+    int jmax,
+    int imax)
+{
+    int numRequests = 1;
+
+    if (c->rank == 0) {
+        numRequests = c->size + 1;
+    }
+
+    MPI_Request requests[numRequests];
+
+    /* all ranks send their interpolated bulk array */
+    MPI_Isend(src,
+        c->imaxLocal * c->jmaxLocal * c->kmaxLocal,
+        MPI_DOUBLE,
+        0,
+        0,
+        c->comm,
+        &requests[0]);
+
+    /* rank 0 assembles the subdomains */
+    if (c->rank == 0) {
+        for (int i = 0; i < c->size; i++) {
+            MPI_Datatype domainType;
+            int oldSizes[NDIMS] = { kmax, jmax, imax };
+            int newSizes[NDIMS] = { kmaxLocal[i], jmaxLocal[i], imaxLocal[i] };
+            int starts[NDIMS]   = { offset[i * NDIMS + KDIM],
+                  offset[i * NDIMS + JDIM],
+                  offset[i * NDIMS + IDIM] };
+            MPI_Type_create_subarray(NDIMS,
+                oldSizes,
+                newSizes,
+                starts,
+                MPI_ORDER_C,
+                MPI_DOUBLE,
+                &domainType);
+            MPI_Type_commit(&domainType);
+
+            MPI_Irecv(dst, 1, domainType, i, 0, c->comm, &requests[i + 1]);
+            MPI_Type_free(&domainType);
+        }
+    }
+
+    MPI_Waitall(numRequests, requests, MPI_STATUSES_IGNORE);
+}
+
+static int sum(int* sizes, int position)
+{
+    int sum = 0;
+
+    for (int i = 0; i < position; i++) {
+        sum += sizes[i];
+    }
+
+    return sum;
+}
+
+// exported subroutines
+void commReduction(double* v, int op)
+{
+    if (op == MAX) {
+        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+    } else if (op == SUM) {
+        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    }
+}
+
+int commIsBoundary(Comm* c, Direction direction)
+{
+    switch (direction) {
+    case LEFT:
+        return c->coords[ICORD] == 0;
+        break;
+    case RIGHT:
+        return c->coords[ICORD] == (c->dims[ICORD] - 1);
+        break;
+    case BOTTOM:
+        return c->coords[JCORD] == 0;
+        break;
+    case TOP:
+        return c->coords[JCORD] == (c->dims[JCORD] - 1);
+        break;
+    case FRONT:
+        return c->coords[KCORD] == 0;
+        break;
+    case BACK:
+        return c->coords[KCORD] == (c->dims[KCORD] - 1);
+        break;
+    case NDIRS:
+        printf("ERROR!\n");
+        break;
+    }
+
+    return 0;
+}
+
+void commExchange(Comm* c, double* grid)
+{
+    int counts[6]      = { 1, 1, 1, 1, 1, 1 };
+    MPI_Aint displs[6] = { 0, 0, 0, 0, 0, 0 };
+
+    MPI_Neighbor_alltoallw(grid,
+        counts,
+        displs,
+        c->sbufferTypes,
+        grid,
+        counts,
+        displs,
+        c->rbufferTypes,
+        c->comm);
+}
+
+void commShift(Comm* c, double* f, double* g, double* h)
+{
+    MPI_Request requests[6] = { MPI_REQUEST_NULL,
+        MPI_REQUEST_NULL,
+        MPI_REQUEST_NULL,
+        MPI_REQUEST_NULL,
+        MPI_REQUEST_NULL,
+        MPI_REQUEST_NULL };
+
+    /* shift G */
+    /* receive ghost cells from bottom neighbor */
+    MPI_Irecv(g,
+        1,
+        c->rbufferTypes[BOTTOM],
+        c->neighbours[BOTTOM],
+        0,
+        c->comm,
+        &requests[0]);
+
+    /* send ghost cells to top neighbor */
+    MPI_Isend(g, 1, c->sbufferTypes[TOP], c->neighbours[TOP], 0, c->comm, &requests[1]);
+
+    /* shift F */
+    /* receive ghost cells from left neighbor */
+    MPI_Irecv(f, 1, c->rbufferTypes[LEFT], c->neighbours[LEFT], 1, c->comm, &requests[2]);
+
+    /* send ghost cells to right neighbor */
+    MPI_Isend(f,
+        1,
+        c->sbufferTypes[RIGHT],
+        c->neighbours[RIGHT],
+        1,
+        c->comm,
+        &requests[3]);
+
+    /* shift H */
+    /* receive ghost cells from front neighbor */
+    MPI_Irecv(h,
+        1,
+        c->rbufferTypes[FRONT],
+        c->neighbours[FRONT],
+        2,
+        c->comm,
+        &requests[4]);
+
+    /* send ghost cells to back neighbor */
+    MPI_Isend(h, 1, c->sbufferTypes[BACK], c->neighbours[BACK], 2, c->comm, &requests[5]);
+
+    MPI_Waitall(6, requests, MPI_STATUSES_IGNORE);
+}
+
+#define G(v, i, j, k)                                                                    \
+    v[(k) * (imaxLocal + 2) * (jmaxLocal + 2) + (j) * (imaxLocal + 2) + (i)]
+
+void commCollectResult(Comm* c,
+    double* ug,
+    double* vg,
+    double* wg,
+    double* pg,
+    double* u,
+    double* v,
+    double* w,
+    double* p,
+    int kmax,
+    int jmax,
+    int imax)
+{
+    int imaxLocal = c->imaxLocal;
+    int jmaxLocal = c->jmaxLocal;
+    int kmaxLocal = c->kmaxLocal;
+
+    int offset[c->size * NDIMS];
+    int imaxLocalAll[c->size];
+    int jmaxLocalAll[c->size];
+    int kmaxLocalAll[c->size];
+
+    MPI_Gather(&imaxLocal, 1, MPI_INT, imaxLocalAll, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Gather(&jmaxLocal, 1, MPI_INT, jmaxLocalAll, 1, MPI_INT, 0, MPI_COMM_WORLD);
+    MPI_Gather(&kmaxLocal, 1, MPI_INT, kmaxLocalAll, 1, MPI_INT, 0, MPI_COMM_WORLD);
+
+    if (c->rank == 0) {
+        for (int i = 0; i < c->size; i++) {
+            int coords[NCORDS];
+            MPI_Cart_coords(c->comm, i, NDIMS, coords);
+            offset[i * NDIMS + IDIM] = sum(imaxLocalAll, coords[ICORD]);
+            offset[i * NDIMS + JDIM] = sum(jmaxLocalAll, coords[JCORD]);
+            offset[i * NDIMS + KDIM] = sum(kmaxLocalAll, coords[KCORD]);
+            printf("Rank: %d, Coords(k,j,i): %d %d %d, Size(k,j,i): %d %d %d, "
+                   "Offset(k,j,i): %d %d %d\n",
+                i,
+                coords[KCORD],
+                coords[JCORD],
+                coords[ICORD],
+                kmaxLocalAll[i],
+                jmaxLocalAll[i],
+                imaxLocalAll[i],
+                offset[i * NDIMS + KDIM],
+                offset[i * NDIMS + JDIM],
+                offset[i * NDIMS + IDIM]);
+        }
+    }
+
+    size_t bytesize = imaxLocal * jmaxLocal * kmaxLocal * sizeof(double);
+    double* tmp     = allocate(64, bytesize);
+    int idx         = 0;
+
+    /* collect P */
+    for (int k = 1; k < kmaxLocal + 1; k++) {
+        for (int j = 1; j < jmaxLocal + 1; j++) {
+            for (int i = 1; i < imaxLocal + 1; i++) {
+                tmp[idx++] = G(p, i, j, k);
+            }
+        }
+    }
+
+    assembleResult(c,
+        tmp,
+        pg,
+        imaxLocalAll,
+        jmaxLocalAll,
+        kmaxLocalAll,
+        offset,
+        kmax,
+        jmax,
+        imax);
+
+    /* collect U */
+    idx = 0;
+
+    for (int k = 1; k < kmaxLocal + 1; k++) {
+        for (int j = 1; j < jmaxLocal + 1; j++) {
+            for (int i = 1; i < imaxLocal + 1; i++) {
+                tmp[idx++] = (G(u, i, j, k) + G(u, i - 1, j, k)) / 2.0;
+            }
+        }
+    }
+
+    assembleResult(c,
+        tmp,
+        ug,
+        imaxLocalAll,
+        jmaxLocalAll,
+        kmaxLocalAll,
+        offset,
+        kmax,
+        jmax,
+        imax);
+
+    /* collect V */
+    idx = 0;
+
+    for (int k = 1; k < kmaxLocal + 1; k++) {
+        for (int j = 1; j < jmaxLocal + 1; j++) {
+            for (int i = 1; i < imaxLocal + 1; i++) {
+                tmp[idx++] = (G(v, i, j, k) + G(v, i, j - 1, k)) / 2.0;
+            }
+        }
+    }
+
+    assembleResult(c,
+        tmp,
+        vg,
+        imaxLocalAll,
+        jmaxLocalAll,
+        kmaxLocalAll,
+        offset,
+        kmax,
+        jmax,
+        imax);
+
+    /* collect W */
+    idx = 0;
+
+    for (int k = 1; k < kmaxLocal + 1; k++) {
+        for (int j = 1; j < jmaxLocal + 1; j++) {
+            for (int i = 1; i < imaxLocal + 1; i++) {
+                tmp[idx++] = (G(w, i, j, k) + G(w, i, j, k - 1)) / 2.0;
+            }
+        }
+    }
+
+    assembleResult(c,
+        tmp,
+        wg,
+        imaxLocalAll,
+        jmaxLocalAll,
+        kmaxLocalAll,
+        offset,
+        kmax,
+        jmax,
+        imax);
+
+    free(tmp);
+}
+
+void commPrintConfig(Comm* c)
+{
+    fflush(stdout);
+    MPI_Barrier(MPI_COMM_WORLD);
+    if (commIsMaster(c)) {
+        printf("Communication setup:\n");
+    }
+
+    for (int i = 0; i < c->size; i++) {
+        if (i == c->rank) {
+            printf("\tRank %d of %d\n", c->rank, c->size);
+            printf("\tNeighbours (front, back, bottom, top, left, right): %d, %d, %d, "
+                   "%d, %d, %d\n",
+                c->neighbours[FRONT],
+                c->neighbours[BACK],
+                c->neighbours[BOTTOM],
+                c->neighbours[TOP],
+                c->neighbours[LEFT],
+                c->neighbours[RIGHT]);
+            printf("\tCoordinates (k,j,i) %d %d %d\n",
+                c->coords[KCORD],
+                c->coords[JCORD],
+                c->coords[ICORD]);
+            printf("\tLocal domain size (k,j,i) %dx%dx%d\n",
+                c->kmaxLocal,
+                c->jmaxLocal,
+                c->imaxLocal);
+            fflush(stdout);
+        }
+    }
+    MPI_Barrier(MPI_COMM_WORLD);
+}
+
+void commInit(Comm* c, int kmax, int jmax, int imax)
+{
+    /* setup communication */
+    MPI_Comm_rank(MPI_COMM_WORLD, &(c->rank));
+    MPI_Comm_size(MPI_COMM_WORLD, &(c->size));
+    int dims[NDIMS]    = { 0, 0, 0 };
+    int periods[NDIMS] = { 0, 0, 0 };
+    MPI_Dims_create(c->size, NDIMS, dims);
+    MPI_Cart_create(MPI_COMM_WORLD, NCORDS, dims, periods, 0, &c->comm);
+    MPI_Cart_shift(c->comm, ICORD, 1, &c->neighbours[LEFT], &c->neighbours[RIGHT]);
+    MPI_Cart_shift(c->comm, JCORD, 1, &c->neighbours[BOTTOM], &c->neighbours[TOP]);
+    MPI_Cart_shift(c->comm, KCORD, 1, &c->neighbours[FRONT], &c->neighbours[BACK]);
+    MPI_Cart_get(c->comm, NCORDS, c->dims, periods, c->coords);
+
+    c->imaxLocal = sizeOfRank(c->rank, dims[ICORD], imax);
+    c->jmaxLocal = sizeOfRank(c->rank, dims[JCORD], jmax);
+    c->kmaxLocal = sizeOfRank(c->rank, dims[KCORD], kmax);
+
+    // setup buffer types for communication
+    setupCommunication(c, LEFT, BULK);
+    setupCommunication(c, LEFT, HALO);
+    setupCommunication(c, RIGHT, BULK);
+    setupCommunication(c, RIGHT, HALO);
+    setupCommunication(c, BOTTOM, BULK);
+    setupCommunication(c, BOTTOM, HALO);
+    setupCommunication(c, TOP, BULK);
+    setupCommunication(c, TOP, HALO);
+    setupCommunication(c, FRONT, BULK);
+    setupCommunication(c, FRONT, HALO);
+    setupCommunication(c, BACK, BULK);
+    setupCommunication(c, BACK, HALO);
+}
+
+void commFree(Comm* c)
+{
+    for (int i = 0; i < NDIRS; i++) {
+        MPI_Type_free(&c->sbufferTypes[i]);
+        MPI_Type_free(&c->rbufferTypes[i]);
+    }
+}
--- a/BasicSolver/3D-mpi-io/src/comm.h
+++ b/BasicSolver/3D-mpi-io/src/comm.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of nusif-solver.
+ * Use of this source code is governed by a MIT style
+ * license that can be found in the LICENSE file.
+ */
+#ifndef __COMM_H_
+#define __COMM_H_
+#include <mpi.h>
+/*
+ * Spatial directions:
+ * ICORD (0) from 0 (LEFT) to imax (RIGHT)
+ * JCORD (1) from 0 (BOTTOM) to jmax (TOP)
+ * KCORD (2) from 0 (FRONT) to kmax (BACK)
+ * All derived Subarray types are in C ordering
+ * with indices KDIM (0), JDIM(1) and IDIM(2)
+ * */
+typedef enum direction { LEFT = 0, RIGHT, BOTTOM, TOP, FRONT, BACK, NDIRS } Direction;
+typedef enum coordinates { ICORD = 0, JCORD, KCORD, NCORDS } Coordinates;
+typedef enum dimension { KDIM = 0, JDIM, IDIM, NDIMS } Dimension;
+enum layer { HALO = 0, BULK };
+enum op { MAX = 0, SUM };
+
+typedef struct {
+    int rank;
+    int size;
+    MPI_Comm comm;
+    MPI_Datatype sbufferTypes[NDIRS];
+    MPI_Datatype rbufferTypes[NDIRS];
+    int neighbours[NDIRS];
+    int coords[NDIMS], dims[NDIMS];
+    int imaxLocal, jmaxLocal, kmaxLocal;
+    MPI_File fh;
+} Comm;
+
+extern void commInit(Comm* comm, int kmax, int jmax, int imax);
+extern void commFree(Comm* comm);
+extern void commPrintConfig(Comm*);
+extern void commExchange(Comm*, double*);
+extern void commShift(Comm* c, double* f, double* g, double* h);
+extern void commReduction(double* v, int op);
+extern int commIsBoundary(Comm* c, Direction direction);
+extern void commCollectResult(Comm* c,
+    double* ug,
+    double* vg,
+    double* wg,
+    double* pg,
+    double* u,
+    double* v,
+    double* w,
+    double* p,
+    int kmax,
+    int jmax,
+    int imax);
+
+static inline int commIsMaster(Comm* c) { return c->rank == 0; }
+#endif // __COMM_H_
--- a/BasicSolver/3D-mpi-io/src/grid.h
+++ b/BasicSolver/3D-mpi-io/src/grid.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of nusif-solver.
+ * Use of this source code is governed by a MIT style
+ * license that can be found in the LICENSE file.
+ */
+#ifndef __GRID_H_
+#define __GRID_H_
+
+typedef struct {
+    double dx, dy, dz;
+    int imax, jmax, kmax;
+    double xlength, ylength, zlength;
+} Grid;
+
+#endif // __GRID_H_
--- a/BasicSolver/3D-mpi-io/src/likwid-marker.h
+++ b/BasicSolver/3D-mpi-io/src/likwid-marker.h
@@ -0,0 +1,54 @@
+/*
+ * =======================================================================================
+ *
+ *      Author:   Jan Eitzinger (je), jan.eitzinger@fau.de
+ *      Copyright (c) 2020 RRZE, University Erlangen-Nuremberg
+ *
+ *      Permission is hereby granted, free of charge, to any person obtaining a copy
+ *      of this software and associated documentation files (the "Software"), to deal
+ *      in the Software without restriction, including without limitation the rights
+ *      to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ *      copies of the Software, and to permit persons to whom the Software is
+ *      furnished to do so, subject to the following conditions:
+ *
+ *      The above copyright notice and this permission notice shall be included in all
+ *      copies or substantial portions of the Software.
+ *
+ *      THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ *      IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ *      FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ *      AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ *      LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ *      OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ *      SOFTWARE.
+ *
+ * =======================================================================================
+ */
+#ifndef LIKWID_MARKERS_H
+#define LIKWID_MARKERS_H
+
+#ifdef LIKWID_PERFMON
+#include <likwid.h>
+#define LIKWID_MARKER_INIT                likwid_markerInit()
+#define LIKWID_MARKER_THREADINIT          likwid_markerThreadInit()
+#define LIKWID_MARKER_SWITCH              likwid_markerNextGroup()
+#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag)
+#define LIKWID_MARKER_START(regionTag)    likwid_markerStartRegion(regionTag)
+#define LIKWID_MARKER_STOP(regionTag)     likwid_markerStopRegion(regionTag)
+#define LIKWID_MARKER_CLOSE               likwid_markerClose()
+#define LIKWID_MARKER_RESET(regionTag)    likwid_markerResetRegion(regionTag)
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)                       \
+    likwid_markerGetRegion(regionTag, nevents, events, time, count)
+#else /* LIKWID_PERFMON */
+#define LIKWID_MARKER_INIT
+#define LIKWID_MARKER_THREADINIT
+#define LIKWID_MARKER_SWITCH
+#define LIKWID_MARKER_REGISTER(regionTag)
+#define LIKWID_MARKER_START(regionTag)
+#define LIKWID_MARKER_STOP(regionTag)
+#define LIKWID_MARKER_CLOSE
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+#define LIKWID_MARKER_RESET(regionTag)
+#endif /* LIKWID_PERFMON */
+
+#endif /*LIKWID_MARKERS_H*/
--- a/BasicSolver/3D-mpi-io/src/main.c
+++ b/BasicSolver/3D-mpi-io/src/main.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved.
+ * Use of this source code is governed by a MIT-style
+ * license that can be found in the LICENSE file.
+ */
+#include <float.h>
+#include <limits.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "allocate.h"
+#include "parameter.h"
+#include "progress.h"
+#include "solver.h"
+#include "test.h"
+#include "timing.h"
+#include "vtkWriter.h"
+
+int main(int argc, char** argv)
+{
+    int rank;
+    double timeStart, timeStop;
+    Parameter params;
+    Solver solver;
+
+    MPI_Init(&argc, &argv);
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    initParameter(&params);
+
+    if (argc != 2) {
+        printf("Usage: %s <configFile>\n", argv[0]);
+        exit(EXIT_SUCCESS);
+    }
+
+    readParameter(&params, argv[1]);
+    if (commIsMaster(&solver.comm)) {
+        printParameter(&params);
+    }
+    initSolver(&solver, &params);
+#ifndef VERBOSE
+    initProgress(solver.te);
+#endif
+
+    double tau = solver.tau;
+    double te  = solver.te;
+    double t   = 0.0;
+    int nt     = 0;
+
+    timeStart = getTimeStamp();
+    while (t <= te) {
+        if (tau > 0.0) computeTimestep(&solver);
+        setBoundaryConditions(&solver);
+        setSpecialBoundaryCondition(&solver);
+        computeFG(&solver);
+        computeRHS(&solver);
+        // if (nt % 100 == 0) normalizePressure(&solver);
+        solve(&solver);
+        adaptUV(&solver);
+        t += solver.dt;
+        nt++;
+
+#ifdef VERBOSE
+        if (commIsMaster(&solver.comm)) {
+            printf("TIME %f , TIMESTEP %f\n", t, solver.dt);
+        }
+#else
+        printProgress(t);
+#endif
+    }
+    timeStop = getTimeStamp();
+    stopProgress();
+    if (commIsMaster(&solver.comm)) {
+        printf("Solution took %.2fs\n", timeStop - timeStart);
+    }
+
+    // testInit(&solver);
+    // commExchange(&solver.comm, solver.p);
+    // testPrintHalo(&solver, solver.p);
+
+    double *pg, *ug, *vg, *wg;
+
+    if (commIsMaster(&solver.comm)) {
+        size_t bytesize = solver.grid.imax * solver.grid.jmax * solver.grid.kmax *
+                          sizeof(double);
+
+        pg = allocate(64, bytesize);
+        ug = allocate(64, bytesize);
+        vg = allocate(64, bytesize);
+        wg = allocate(64, bytesize);
+    }
+
+    commCollectResult(&solver.comm,
+        ug,
+        vg,
+        wg,
+        pg,
+        solver.u,
+        solver.v,
+        solver.w,
+        solver.p,
+        solver.grid.kmax,
+        solver.grid.jmax,
+        solver.grid.imax);
+
+    VtkOptions opts = { .grid = solver.grid, .comm = solver.comm };
+    vtkOpen(&opts, solver.problem);
+    vtkScalar(&opts, "pressure", pg);
+    vtkVector(&opts, "velocity", (VtkVector) { ug, vg, wg });
+    vtkClose(&opts);
+
+    commFree(&solver.comm);
+    MPI_Finalize();
+    return EXIT_SUCCESS;
+}
--- a/BasicSolver/3D-mpi-io/src/parameter.c
+++ b/BasicSolver/3D-mpi-io/src/parameter.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of nusif-solver.
+ * Use of this source code is governed by a MIT style
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "parameter.h"
+#include "util.h"
+#define MAXLINE 4096
+
+void initParameter(Parameter* param)
+{
+    param->xlength = 1.0;
+    param->ylength = 1.0;
+    param->zlength = 1.0;
+    param->imax    = 100;
+    param->jmax    = 100;
+    param->kmax    = 100;
+    param->itermax = 1000;
+    param->eps     = 0.0001;
+    param->omg     = 1.7;
+    param->re      = 100.0;
+    param->gamma   = 0.9;
+    param->tau     = 0.5;
+}
+
+void readParameter(Parameter* param, const char* filename)
+{
+    FILE* fp = fopen(filename, "r");
+    char line[MAXLINE];
+    int i;
+
+    if (!fp) {
+        fprintf(stderr, "Could not open parameter file: %s\n", filename);
+        exit(EXIT_FAILURE);
+    }
+
+    while (!feof(fp)) {
+        line[0] = '\0';
+        fgets(line, MAXLINE, fp);
+        for (i = 0; line[i] != '\0' && line[i] != '#'; i++)
+            ;
+        line[i] = '\0';
+
+        char* tok = strtok(line, " ");
+        char* val = strtok(NULL, " ");
+
+#define PARSE_PARAM(p, f)                                                                \
+    if (strncmp(tok, #p, sizeof(#p) / sizeof(#p[0]) - 1) == 0) {                         \
+        param->p = f(val);                                                               \
+    }
+#define PARSE_STRING(p) PARSE_PARAM(p, strdup)
+#define PARSE_INT(p)    PARSE_PARAM(p, atoi)
+#define PARSE_REAL(p)   PARSE_PARAM(p, atof)
+
+        if (tok != NULL && val != NULL) {
+            PARSE_REAL(xlength);
+            PARSE_REAL(ylength);
+            PARSE_REAL(zlength);
+            PARSE_INT(imax);
+            PARSE_INT(jmax);
+            PARSE_INT(kmax);
+            PARSE_INT(itermax);
+            PARSE_REAL(eps);
+            PARSE_REAL(omg);
+            PARSE_REAL(re);
+            PARSE_REAL(tau);
+            PARSE_REAL(gamma);
+            PARSE_REAL(dt);
+            PARSE_REAL(te);
+            PARSE_REAL(gx);
+            PARSE_REAL(gy);
+            PARSE_REAL(gz);
+            PARSE_STRING(name);
+            PARSE_INT(bcLeft);
+            PARSE_INT(bcRight);
+            PARSE_INT(bcBottom);
+            PARSE_INT(bcTop);
+            PARSE_INT(bcFront);
+            PARSE_INT(bcBack);
+            PARSE_REAL(u_init);
+            PARSE_REAL(v_init);
+            PARSE_REAL(w_init);
+            PARSE_REAL(p_init);
+        }
+    }
+
+    fclose(fp);
+}
+
+void printParameter(Parameter* param)
+{
+    printf("Parameters for %s\n", param->name);
+    printf("Boundary conditions Left:%d Right:%d Bottom:%d Top:%d Front:%d "
+           "Back:%d\n",
+        param->bcLeft,
+        param->bcRight,
+        param->bcBottom,
+        param->bcTop,
+        param->bcFront,
+        param->bcBack);
+    printf("\tReynolds number: %.2f\n", param->re);
+    printf("\tInit arrays: U:%.2f V:%.2f W:%.2f P:%.2f\n",
+        param->u_init,
+        param->v_init,
+        param->w_init,
+        param->p_init);
+    printf("Geometry data:\n");
+    printf("\tDomain box size (x, y, z): %.2f, %.2f, %.2f\n",
+        param->xlength,
+        param->ylength,
+        param->zlength);
+    printf("\tCells (x, y, z): %d, %d, %d\n", param->imax, param->jmax, param->kmax);
+    printf("Timestep parameters:\n");
+    printf("\tDefault stepsize: %.2f, Final time %.2f\n", param->dt, param->te);
+    printf("\tTau factor: %.2f\n", param->tau);
+    printf("Iterative solver parameters:\n");
+    printf("\tMax iterations: %d\n", param->itermax);
+    printf("\tepsilon (stopping tolerance) : %f\n", param->eps);
+    printf("\tgamma (stopping tolerance) : %f\n", param->gamma);
+    printf("\tomega (SOR relaxation): %f\n", param->omg);
+}
--- a/BasicSolver/3D-mpi-io/src/parameter.h
+++ b/BasicSolver/3D-mpi-io/src/parameter.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of nusif-solver.
+ * Use of this source code is governed by a MIT style
+ * license that can be found in the LICENSE file.
+ */
+#ifndef __PARAMETER_H_
+#define __PARAMETER_H_
+
+typedef struct {
+    int imax, jmax, kmax;
+    double xlength, ylength, zlength;
+    int itermax;
+    double eps, omg;
+    double re, tau, gamma;
+    double te, dt;
+    double gx, gy, gz;
+    char* name;
+    int bcLeft, bcRight, bcBottom, bcTop, bcFront, bcBack;
+    double u_init, v_init, w_init, p_init;
+} Parameter;
+
+void initParameter(Parameter*);
+void readParameter(Parameter*, const char*);
+void printParameter(Parameter*);
+#endif
--- a/BasicSolver/3D-mpi-io/src/progress.c
+++ b/BasicSolver/3D-mpi-io/src/progress.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of nusif-solver.
+ * Use of this source code is governed by a MIT style
+ * license that can be found in the LICENSE file.
+ */
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "progress.h"
+
+static double _end;
+static int _current;
+
+void initProgress(double end)
+{
+    _end     = end;
+    _current = 0;
+
+    printf("[          ]");
+    fflush(stdout);
+}
+
+void printProgress(double current)
+{
+    int new = (int)rint((current / _end) * 10.0);
+
+    if (new > _current) {
+        char progress[11];
+        _current    = new;
+        progress[0] = 0;
+
+        for (int i = 0; i < 10; i++) {
+            if (i < _current) {
+                sprintf(progress + strlen(progress), "#");
+            } else {
+                sprintf(progress + strlen(progress), " ");
+            }
+        }
+        printf("\r[%s]", progress);
+    }
+    fflush(stdout);
+}
+
+void stopProgress()
+{
+    printf("\n");
+    fflush(stdout);
+}
--- a/BasicSolver/3D-mpi-io/src/progress.h
+++ b/BasicSolver/3D-mpi-io/src/progress.h
@@ -0,0 +1,14 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved.
+ * Use of this source code is governed by a MIT-style
+ * license that can be found in the LICENSE file.
+ */
+#ifndef __PROGRESS_H_
+#define __PROGRESS_H_
+
+extern void initProgress(double);
+extern void printProgress(double);
+extern void stopProgress();
+
+#endif
--- a/BasicSolver/3D-mpi-io/src/solver.c
+++ b/BasicSolver/3D-mpi-io/src/solver.c
@@ -0,0 +1,845 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of nusif-solver.
+ * Use of this source code is governed by a MIT style
+ * license that can be found in the LICENSE file.
+ */
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "allocate.h"
+#include "comm.h"
+#include "parameter.h"
+#include "solver.h"
+#include "util.h"
+
+#define P(i, j, k)                                                                       \
+    p[(k) * (imaxLocal + 2) * (jmaxLocal + 2) + (j) * (imaxLocal + 2) + (i)]
+#define F(i, j, k)                                                                       \
+    f[(k) * (imaxLocal + 2) * (jmaxLocal + 2) + (j) * (imaxLocal + 2) + (i)]
+#define G(i, j, k)                                                                       \
+    g[(k) * (imaxLocal + 2) * (jmaxLocal + 2) + (j) * (imaxLocal + 2) + (i)]
+#define H(i, j, k)                                                                       \
+    h[(k) * (imaxLocal + 2) * (jmaxLocal + 2) + (j) * (imaxLocal + 2) + (i)]
+#define U(i, j, k)                                                                       \
+    u[(k) * (imaxLocal + 2) * (jmaxLocal + 2) + (j) * (imaxLocal + 2) + (i)]
+#define V(i, j, k)                                                                       \
+    v[(k) * (imaxLocal + 2) * (jmaxLocal + 2) + (j) * (imaxLocal + 2) + (i)]
+#define W(i, j, k)                                                                       \
+    w[(k) * (imaxLocal + 2) * (jmaxLocal + 2) + (j) * (imaxLocal + 2) + (i)]
+#define RHS(i, j, k)                                                                     \
+    rhs[(k) * (imaxLocal + 2) * (jmaxLocal + 2) + (j) * (imaxLocal + 2) + (i)]
+
+static void printConfig(Solver* s)
+{
+    if (commIsMaster(&s->comm)) {
+        printf("Parameters for #%s#\n", s->problem);
+        printf("BC Left:%d Right:%d Bottom:%d Top:%d Front:%d Back:%d\n",
+            s->bcLeft,
+            s->bcRight,
+            s->bcBottom,
+            s->bcTop,
+            s->bcFront,
+            s->bcBack);
+        printf("\tReynolds number: %.2f\n", s->re);
+        printf("\tGx Gy: %.2f %.2f %.2f\n", s->gx, s->gy, s->gz);
+        printf("Geometry data:\n");
+        printf("\tDomain box size (x, y, z): %.2f, %.2f, %.2f\n",
+            s->grid.xlength,
+            s->grid.ylength,
+            s->grid.zlength);
+        printf("\tCells (x, y, z): %d, %d, %d\n",
+            s->grid.imax,
+            s->grid.jmax,
+            s->grid.kmax);
+        printf("\tCell size (dx, dy, dz): %f, %f, %f\n",
+            s->grid.dx,
+            s->grid.dy,
+            s->grid.dz);
+        printf("Timestep parameters:\n");
+        printf("\tDefault stepsize: %.2f, Final time %.2f\n", s->dt, s->te);
+        printf("\tdt bound: %.6f\n", s->dtBound);
+        printf("\tTau factor: %.2f\n", s->tau);
+        printf("Iterative parameters:\n");
+        printf("\tMax iterations: %d\n", s->itermax);
+        printf("\tepsilon (stopping tolerance) : %f\n", s->eps);
+        printf("\tgamma factor: %f\n", s->gamma);
+        printf("\tomega (SOR relaxation): %f\n", s->omega);
+    }
+    commPrintConfig(&s->comm);
+}
+
+void initSolver(Solver* s, Parameter* params)
+{
+    s->problem  = params->name;
+    s->bcLeft   = params->bcLeft;
+    s->bcRight  = params->bcRight;
+    s->bcBottom = params->bcBottom;
+    s->bcTop    = params->bcTop;
+    s->bcFront  = params->bcFront;
+    s->bcBack   = params->bcBack;
+
+    s->grid.imax    = params->imax;
+    s->grid.jmax    = params->jmax;
+    s->grid.kmax    = params->kmax;
+    s->grid.xlength = params->xlength;
+    s->grid.ylength = params->ylength;
+    s->grid.zlength = params->zlength;
+    s->grid.dx      = params->xlength / params->imax;
+    s->grid.dy      = params->ylength / params->jmax;
+    s->grid.dz      = params->zlength / params->kmax;
+
+    s->eps     = params->eps;
+    s->omega   = params->omg;
+    s->itermax = params->itermax;
+    s->re      = params->re;
+    s->gx      = params->gx;
+    s->gy      = params->gy;
+    s->gz      = params->gz;
+    s->dt      = params->dt;
+    s->te      = params->te;
+    s->tau     = params->tau;
+    s->gamma   = params->gamma;
+
+    commInit(&s->comm, s->grid.kmax, s->grid.jmax, s->grid.imax);
+    /* allocate arrays */
+    int imaxLocal = s->comm.imaxLocal;
+    int jmaxLocal = s->comm.jmaxLocal;
+    int kmaxLocal = s->comm.kmaxLocal;
+    size_t size   = (imaxLocal + 2) * (jmaxLocal + 2) * (kmaxLocal + 2);
+
+    s->u   = allocate(64, size * sizeof(double));
+    s->v   = allocate(64, size * sizeof(double));
+    s->w   = allocate(64, size * sizeof(double));
+    s->p   = allocate(64, size * sizeof(double));
+    s->rhs = allocate(64, size * sizeof(double));
+    s->f   = allocate(64, size * sizeof(double));
+    s->g   = allocate(64, size * sizeof(double));
+    s->h   = allocate(64, size * sizeof(double));
+
+    for (int i = 0; i < size; i++) {
+        s->u[i]   = params->u_init;
+        s->v[i]   = params->v_init;
+        s->w[i]   = params->w_init;
+        s->p[i]   = params->p_init;
+        s->rhs[i] = 0.0;
+        s->f[i]   = 0.0;
+        s->g[i]   = 0.0;
+        s->h[i]   = 0.0;
+    }
+
+    double dx = s->grid.dx;
+    double dy = s->grid.dy;
+    double dz = s->grid.dz;
+
+    double invSqrSum = 1.0 / (dx * dx) + 1.0 / (dy * dy) + 1.0 / (dz * dz);
+    s->dtBound       = 0.5 * s->re * 1.0 / invSqrSum;
+
+#ifdef VERBOSE
+    printConfig(s);
+#endif /* VERBOSE */
+}
+
+void computeRHS(Solver* s)
+{
+    int imaxLocal = s->comm.imaxLocal;
+    int jmaxLocal = s->comm.jmaxLocal;
+    int kmaxLocal = s->comm.kmaxLocal;
+
+    double idx = 1.0 / s->grid.dx;
+    double idy = 1.0 / s->grid.dy;
+    double idz = 1.0 / s->grid.dz;
+    double idt = 1.0 / s->dt;
+
+    double* rhs = s->rhs;
+    double* f   = s->f;
+    double* g   = s->g;
+    double* h   = s->h;
+
+    commShift(&s->comm, f, g, h);
+
+    for (int k = 1; k < kmaxLocal + 1; k++) {
+        for (int j = 1; j < jmaxLocal + 1; j++) {
+            for (int i = 1; i < imaxLocal + 1; i++) {
+                RHS(i, j, k) = ((F(i, j, k) - F(i - 1, j, k)) * idx +
+                                   (G(i, j, k) - G(i, j - 1, k)) * idy +
+                                   (H(i, j, k) - H(i, j, k - 1)) * idz) *
+                               idt;
+            }
+        }
+    }
+}
+
+void solve(Solver* s)
+{
+    int imaxLocal = s->comm.imaxLocal;
+    int jmaxLocal = s->comm.jmaxLocal;
+    int kmaxLocal = s->comm.kmaxLocal;
+
+    int imax = s->grid.imax;
+    int jmax = s->grid.jmax;
+    int kmax = s->grid.kmax;
+
+    double eps  = s->eps;
+    int itermax = s->itermax;
+    double dx2  = s->grid.dx * s->grid.dx;
+    double dy2  = s->grid.dy * s->grid.dy;
+    double dz2  = s->grid.dz * s->grid.dz;
+    double idx2 = 1.0 / dx2;
+    double idy2 = 1.0 / dy2;
+    double idz2 = 1.0 / dz2;
+
+    double factor = s->omega * 0.5 * (dx2 * dy2 * dz2) /
+                    (dy2 * dz2 + dx2 * dz2 + dx2 * dy2);
+    double* p    = s->p;
+    double* rhs  = s->rhs;
+    double epssq = eps * eps;
+    int it       = 0;
+    double res   = 1.0;
+    commExchange(&s->comm, p);
+
+    while ((res >= epssq) && (it < itermax)) {
+        res = 0.0;
+
+        for (int k = 1; k < kmaxLocal + 1; k++) {
+            for (int j = 1; j < jmaxLocal + 1; j++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+
+                    double r = RHS(i, j, k) -
+                               ((P(i + 1, j, k) - 2.0 * P(i, j, k) + P(i - 1, j, k)) *
+                                       idx2 +
+                                   (P(i, j + 1, k) - 2.0 * P(i, j, k) + P(i, j - 1, k)) *
+                                       idy2 +
+                                   (P(i, j, k + 1) - 2.0 * P(i, j, k) + P(i, j, k - 1)) *
+                                       idz2);
+
+                    P(i, j, k) -= (factor * r);
+                    res += (r * r);
+                }
+            }
+        }
+
+        if (commIsBoundary(&s->comm, FRONT)) {
+            for (int j = 1; j < jmaxLocal + 1; j++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    P(i, j, 0) = P(i, j, 1);
+                }
+            }
+        }
+
+        if (commIsBoundary(&s->comm, BACK)) {
+            for (int j = 1; j < jmaxLocal + 1; j++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    P(i, j, kmaxLocal + 1) = P(i, j, kmaxLocal);
+                }
+            }
+        }
+
+        if (commIsBoundary(&s->comm, BOTTOM)) {
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    P(i, 0, k) = P(i, 1, k);
+                }
+            }
+        }
+
+        if (commIsBoundary(&s->comm, TOP)) {
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    P(i, jmaxLocal + 1, k) = P(i, jmaxLocal, k);
+                }
+            }
+        }
+
+        if (commIsBoundary(&s->comm, LEFT)) {
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int j = 1; j < jmaxLocal + 1; j++) {
+                    P(0, j, k) = P(1, j, k);
+                }
+            }
+        }
+
+        if (commIsBoundary(&s->comm, RIGHT)) {
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int j = 1; j < jmaxLocal + 1; j++) {
+                    P(imaxLocal + 1, j, k) = P(imaxLocal, j, k);
+                }
+            }
+        }
+
+        commReduction(&res, SUM);
+        res = res / (double)(imax * jmax * kmax);
+#ifdef DEBUG
+        if (commIsMaster(&s->comm)) {
+            printf("%d Residuum: %e\n", it, res);
+        }
+#endif
+        commExchange(&s->comm, p);
+        it++;
+    }
+
+#ifdef VERBOSE
+    if (commIsMaster(&s->comm)) {
+        printf("Solver took %d iterations to reach %f\n", it, sqrt(res));
+    }
+#endif
+}
+
+static double maxElement(Solver* s, double* m)
+{
+    int size = (s->comm.imaxLocal + 2) * (s->comm.jmaxLocal + 2) *
+               (s->comm.kmaxLocal + 2);
+    double maxval = DBL_MIN;
+
+    for (int i = 0; i < size; i++) {
+        maxval = MAX(maxval, fabs(m[i]));
+    }
+    commReduction(&maxval, MAX);
+    return maxval;
+}
+
+void normalizePressure(Solver* s)
+{
+    int imaxLocal = s->comm.imaxLocal;
+    int jmaxLocal = s->comm.jmaxLocal;
+    int kmaxLocal = s->comm.kmaxLocal;
+
+    double* p   = s->p;
+    double avgP = 0.0;
+
+    for (int k = 1; k < kmaxLocal + 1; k++) {
+        for (int j = 1; j < jmaxLocal + 1; j++) {
+            for (int i = 1; i < imaxLocal + 1; i++) {
+                avgP += P(i, j, k);
+            }
+        }
+    }
+    commReduction(&avgP, SUM);
+    avgP /= (s->grid.imax * s->grid.jmax * s->grid.kmax);
+
+    for (int k = 1; k < kmaxLocal + 1; k++) {
+        for (int j = 1; j < jmaxLocal + 1; j++) {
+            for (int i = 1; i < imaxLocal + 1; i++) {
+                P(i, j, k) = P(i, j, k) - avgP;
+            }
+        }
+    }
+}
+
+void computeTimestep(Solver* s)
+{
+    double dt = s->dtBound;
+    double dx = s->grid.dx;
+    double dy = s->grid.dy;
+    double dz = s->grid.dz;
+
+    double umax = maxElement(s, s->u);
+    double vmax = maxElement(s, s->v);
+    double wmax = maxElement(s, s->w);
+
+    if (umax > 0) {
+        dt = (dt > dx / umax) ? dx / umax : dt;
+    }
+    if (vmax > 0) {
+        dt = (dt > dy / vmax) ? dy / vmax : dt;
+    }
+    if (wmax > 0) {
+        dt = (dt > dz / wmax) ? dz / wmax : dt;
+    }
+
+    s->dt = dt * s->tau;
+}
+
+void setBoundaryConditions(Solver* s)
+{
+    int imaxLocal = s->comm.imaxLocal;
+    int jmaxLocal = s->comm.jmaxLocal;
+    int kmaxLocal = s->comm.kmaxLocal;
+
+    double* u = s->u;
+    double* v = s->v;
+    double* w = s->w;
+
+    if (commIsBoundary(&s->comm, TOP)) {
+        switch (s->bcTop) {
+        case NOSLIP:
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    U(i, jmaxLocal + 1, k) = -U(i, jmaxLocal, k);
+                    V(i, jmaxLocal, k)     = 0.0;
+                    W(i, jmaxLocal + 1, k) = -W(i, jmaxLocal, k);
+                }
+            }
+            break;
+        case SLIP:
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    U(i, jmaxLocal + 1, k) = U(i, jmaxLocal, k);
+                    V(i, jmaxLocal, k)     = 0.0;
+                    W(i, jmaxLocal + 1, k) = W(i, jmaxLocal, k);
+                }
+            }
+            break;
+        case OUTFLOW:
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    U(i, jmaxLocal + 1, k) = U(i, jmaxLocal, k);
+                    V(i, jmaxLocal, k)     = V(i, jmaxLocal - 1, k);
+                    W(i, jmaxLocal + 1, k) = W(i, jmaxLocal, k);
+                }
+            }
+            break;
+        case PERIODIC:
+            break;
+        }
+    }
+
+    if (commIsBoundary(&s->comm, BOTTOM)) {
+        switch (s->bcBottom) {
+        case NOSLIP:
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    U(i, 0, k) = -U(i, 1, k);
+                    V(i, 0, k) = 0.0;
+                    W(i, 0, k) = -W(i, 1, k);
+                }
+            }
+            break;
+        case SLIP:
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    U(i, 0, k) = U(i, 1, k);
+                    V(i, 0, k) = 0.0;
+                    W(i, 0, k) = W(i, 1, k);
+                }
+            }
+            break;
+        case OUTFLOW:
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    U(i, 0, k) = U(i, 1, k);
+                    V(i, 0, k) = V(i, 1, k);
+                    W(i, 0, k) = W(i, 1, k);
+                }
+            }
+            break;
+        case PERIODIC:
+            break;
+        }
+    }
+
+    if (commIsBoundary(&s->comm, LEFT)) {
+        switch (s->bcLeft) {
+        case NOSLIP:
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int j = 1; j < jmaxLocal + 1; j++) {
+                    U(0, j, k) = 0.0;
+                    V(0, j, k) = -V(1, j, k);
+                    W(0, j, k) = -W(1, j, k);
+                }
+            }
+            break;
+        case SLIP:
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int j = 1; j < jmaxLocal + 1; j++) {
+                    U(0, j, k) = 0.0;
+                    V(0, j, k) = V(1, j, k);
+                    W(0, j, k) = W(1, j, k);
+                }
+            }
+            break;
+        case OUTFLOW:
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int j = 1; j < jmaxLocal + 1; j++) {
+                    U(0, j, k) = U(1, j, k);
+                    V(0, j, k) = V(1, j, k);
+                    W(0, j, k) = W(1, j, k);
+                }
+            }
+            break;
+        case PERIODIC:
+            break;
+        }
+    }
+
+    if (commIsBoundary(&s->comm, RIGHT)) {
+        switch (s->bcRight) {
+        case NOSLIP:
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int j = 1; j < jmaxLocal + 1; j++) {
+                    U(imaxLocal, j, k)     = 0.0;
+                    V(imaxLocal + 1, j, k) = -V(imaxLocal, j, k);
+                    W(imaxLocal + 1, j, k) = -W(imaxLocal, j, k);
+                }
+            }
+            break;
+        case SLIP:
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int j = 1; j < jmaxLocal + 1; j++) {
+                    U(imaxLocal, j, k)     = 0.0;
+                    V(imaxLocal + 1, j, k) = V(imaxLocal, j, k);
+                    W(imaxLocal + 1, j, k) = W(imaxLocal, j, k);
+                }
+            }
+            break;
+        case OUTFLOW:
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int j = 1; j < jmaxLocal + 1; j++) {
+                    U(imaxLocal, j, k)     = U(imaxLocal - 1, j, k);
+                    V(imaxLocal + 1, j, k) = V(imaxLocal, j, k);
+                    W(imaxLocal + 1, j, k) = W(imaxLocal, j, k);
+                }
+            }
+            break;
+        case PERIODIC:
+            break;
+        }
+    }
+
+    if (commIsBoundary(&s->comm, FRONT)) {
+        switch (s->bcFront) {
+        case NOSLIP:
+            for (int j = 1; j < jmaxLocal + 1; j++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    U(i, j, 0) = -U(i, j, 1);
+                    V(i, j, 0) = -V(i, j, 1);
+                    W(i, j, 0) = 0.0;
+                }
+            }
+            break;
+        case SLIP:
+            for (int j = 1; j < jmaxLocal + 1; j++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    U(i, j, 0) = U(i, j, 1);
+                    V(i, j, 0) = V(i, j, 1);
+                    W(i, j, 0) = 0.0;
+                }
+            }
+            break;
+        case OUTFLOW:
+            for (int j = 1; j < jmaxLocal + 1; j++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    U(i, j, 0) = U(i, j, 1);
+                    V(i, j, 0) = V(i, j, 1);
+                    W(i, j, 0) = W(i, j, 1);
+                }
+            }
+            break;
+        case PERIODIC:
+            break;
+        }
+    }
+
+    if (commIsBoundary(&s->comm, BACK)) {
+        switch (s->bcBack) {
+        case NOSLIP:
+            for (int j = 1; j < jmaxLocal + 1; j++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    U(i, j, kmaxLocal + 1) = -U(i, j, kmaxLocal);
+                    V(i, j, kmaxLocal + 1) = -V(i, j, kmaxLocal);
+                    W(i, j, kmaxLocal)     = 0.0;
+                }
+            }
+            break;
+        case SLIP:
+            for (int j = 1; j < jmaxLocal + 1; j++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    U(i, j, kmaxLocal + 1) = U(i, j, kmaxLocal);
+                    V(i, j, kmaxLocal + 1) = V(i, j, kmaxLocal);
+                    W(i, j, kmaxLocal)     = 0.0;
+                }
+            }
+            break;
+        case OUTFLOW:
+            for (int j = 1; j < jmaxLocal + 1; j++) {
+                for (int i = 1; i < imaxLocal + 1; i++) {
+                    U(i, j, kmaxLocal + 1) = U(i, j, kmaxLocal);
+                    V(i, j, kmaxLocal + 1) = V(i, j, kmaxLocal);
+                    W(i, j, kmaxLocal)     = W(i, j, kmaxLocal - 1);
+                }
+            }
+            break;
+        case PERIODIC:
+            break;
+        }
+    }
+}
+
+void setSpecialBoundaryCondition(Solver* s)
+{
+    int imaxLocal = s->comm.imaxLocal;
+    int jmaxLocal = s->comm.jmaxLocal;
+    int kmaxLocal = s->comm.kmaxLocal;
+
+    double* u = s->u;
+
+    if (strcmp(s->problem, "dcavity") == 0) {
+        if (commIsBoundary(&s->comm, TOP)) {
+            for (int k = 1; k < kmaxLocal; k++) {
+                for (int i = 1; i < imaxLocal; i++) {
+                    U(i, jmaxLocal + 1, k) = 2.0 - U(i, jmaxLocal, k);
+                }
+            }
+        }
+    } else if (strcmp(s->problem, "canal") == 0) {
+        if (commIsBoundary(&s->comm, LEFT)) {
+            for (int k = 1; k < kmaxLocal + 1; k++) {
+                for (int j = 1; j < jmaxLocal + 1; j++) {
+                    U(0, j, k) = 2.0;
+                }
+            }
+        }
+    }
+}
+
+void computeFG(Solver* s)
+{
+    int imaxLocal = s->comm.imaxLocal;
+    int jmaxLocal = s->comm.jmaxLocal;
+    int kmaxLocal = s->comm.kmaxLocal;
+
+    double* u = s->u;
+    double* v = s->v;
+    double* w = s->w;
+    double* f = s->f;
+    double* g = s->g;
+    double* h = s->h;
+
+    double gx = s->gx;
+    double gy = s->gy;
+    double gz = s->gz;
+    double dt = s->dt;
+
+    double gamma     = s->gamma;
+    double inverseRe = 1.0 / s->re;
+    double inverseDx = 1.0 / s->grid.dx;
+    double inverseDy = 1.0 / s->grid.dy;
+    double inverseDz = 1.0 / s->grid.dz;
+    double du2dx, dv2dy, dw2dz;
+    double duvdx, duwdx, duvdy, dvwdy, duwdz, dvwdz;
+    double du2dx2, du2dy2, du2dz2;
+    double dv2dx2, dv2dy2, dv2dz2;
+    double dw2dx2, dw2dy2, dw2dz2;
+
+    commExchange(&s->comm, u);
+    commExchange(&s->comm, v);
+    commExchange(&s->comm, w);
+
+    for (int k = 1; k < kmaxLocal + 1; k++) {
+        for (int j = 1; j < jmaxLocal + 1; j++) {
+            for (int i = 1; i < imaxLocal + 1; i++) {
+                du2dx = inverseDx * 0.25 *
+                            ((U(i, j, k) + U(i + 1, j, k)) *
+                                    (U(i, j, k) + U(i + 1, j, k)) -
+                                (U(i, j, k) + U(i - 1, j, k)) *
+                                    (U(i, j, k) + U(i - 1, j, k))) +
+                        gamma * inverseDx * 0.25 *
+                            (fabs(U(i, j, k) + U(i + 1, j, k)) *
+                                    (U(i, j, k) - U(i + 1, j, k)) +
+                                fabs(U(i, j, k) + U(i - 1, j, k)) *
+                                    (U(i, j, k) - U(i - 1, j, k)));
+
+                duvdy = inverseDy * 0.25 *
+                            ((V(i, j, k) + V(i + 1, j, k)) *
+                                    (U(i, j, k) + U(i, j + 1, k)) -
+                                (V(i, j - 1, k) + V(i + 1, j - 1, k)) *
+                                    (U(i, j, k) + U(i, j - 1, k))) +
+                        gamma * inverseDy * 0.25 *
+                            (fabs(V(i, j, k) + V(i + 1, j, k)) *
+                                    (U(i, j, k) - U(i, j + 1, k)) +
+                                fabs(V(i, j - 1, k) + V(i + 1, j - 1, k)) *
+                                    (U(i, j, k) - U(i, j - 1, k)));
+
+                duwdz = inverseDz * 0.25 *
+                            ((W(i, j, k) + W(i + 1, j, k)) *
+                                    (U(i, j, k) + U(i, j, k + 1)) -
+                                (W(i, j, k - 1) + W(i + 1, j, k - 1)) *
+                                    (U(i, j, k) + U(i, j, k - 1))) +
+                        gamma * inverseDz * 0.25 *
+                            (fabs(W(i, j, k) + W(i + 1, j, k)) *
+                                    (U(i, j, k) - U(i, j, k + 1)) +
+                                fabs(W(i, j, k - 1) + W(i + 1, j, k - 1)) *
+                                    (U(i, j, k) - U(i, j, k - 1)));
+
+                du2dx2 = inverseDx * inverseDx *
+                         (U(i + 1, j, k) - 2.0 * U(i, j, k) + U(i - 1, j, k));
+                du2dy2 = inverseDy * inverseDy *
+                         (U(i, j + 1, k) - 2.0 * U(i, j, k) + U(i, j - 1, k));
+                du2dz2 = inverseDz * inverseDz *
+                         (U(i, j, k + 1) - 2.0 * U(i, j, k) + U(i, j, k - 1));
+                F(i, j, k) = U(i, j, k) + dt * (inverseRe * (du2dx2 + du2dy2 + du2dz2) -
+                                                   du2dx - duvdy - duwdz + gx);
+
+                duvdx = inverseDx * 0.25 *
+                            ((U(i, j, k) + U(i, j + 1, k)) *
+                                    (V(i, j, k) + V(i + 1, j, k)) -
+                                (U(i - 1, j, k) + U(i - 1, j + 1, k)) *
+                                    (V(i, j, k) + V(i - 1, j, k))) +
+                        gamma * inverseDx * 0.25 *
+                            (fabs(U(i, j, k) + U(i, j + 1, k)) *
+                                    (V(i, j, k) - V(i + 1, j, k)) +
+                                fabs(U(i - 1, j, k) + U(i - 1, j + 1, k)) *
+                                    (V(i, j, k) - V(i - 1, j, k)));
+
+                dv2dy = inverseDy * 0.25 *
+                            ((V(i, j, k) + V(i, j + 1, k)) *
+                                    (V(i, j, k) + V(i, j + 1, k)) -
+                                (V(i, j, k) + V(i, j - 1, k)) *
+                                    (V(i, j, k) + V(i, j - 1, k))) +
+                        gamma * inverseDy * 0.25 *
+                            (fabs(V(i, j, k) + V(i, j + 1, k)) *
+                                    (V(i, j, k) - V(i, j + 1, k)) +
+                                fabs(V(i, j, k) + V(i, j - 1, k)) *
+                                    (V(i, j, k) - V(i, j - 1, k)));
+
+                dvwdz = inverseDz * 0.25 *
+                            ((W(i, j, k) + W(i, j + 1, k)) *
+                                    (V(i, j, k) + V(i, j, k + 1)) -
+                                (W(i, j, k - 1) + W(i, j + 1, k - 1)) *
+                                    (V(i, j, k) + V(i, j, k + 1))) +
+                        gamma * inverseDz * 0.25 *
+                            (fabs(W(i, j, k) + W(i, j + 1, k)) *
+                                    (V(i, j, k) - V(i, j, k + 1)) +
+                                fabs(W(i, j, k - 1) + W(i, j + 1, k - 1)) *
+                                    (V(i, j, k) - V(i, j, k + 1)));
+
+                dv2dx2 = inverseDx * inverseDx *
+                         (V(i + 1, j, k) - 2.0 * V(i, j, k) + V(i - 1, j, k));
+                dv2dy2 = inverseDy * inverseDy *
+                         (V(i, j + 1, k) - 2.0 * V(i, j, k) + V(i, j - 1, k));
+                dv2dz2 = inverseDz * inverseDz *
+                         (V(i, j, k + 1) - 2.0 * V(i, j, k) + V(i, j, k - 1));
+                G(i, j, k) = V(i, j, k) + dt * (inverseRe * (dv2dx2 + dv2dy2 + dv2dz2) -
+                                                   duvdx - dv2dy - dvwdz + gy);
+
+                duwdx = inverseDx * 0.25 *
+                            ((U(i, j, k) + U(i, j, k + 1)) *
+                                    (W(i, j, k) + W(i + 1, j, k)) -
+                                (U(i - 1, j, k) + U(i - 1, j, k + 1)) *
+                                    (W(i, j, k) + W(i - 1, j, k))) +
+                        gamma * inverseDx * 0.25 *
+                            (fabs(U(i, j, k) + U(i, j, k + 1)) *
+                                    (W(i, j, k) - W(i + 1, j, k)) +
+                                fabs(U(i - 1, j, k) + U(i - 1, j, k + 1)) *
+                                    (W(i, j, k) - W(i - 1, j, k)));
+
+                dvwdy = inverseDy * 0.25 *
+                            ((V(i, j, k) + V(i, j, k + 1)) *
+                                    (W(i, j, k) + W(i, j + 1, k)) -
+                                (V(i, j - 1, k + 1) + V(i, j - 1, k)) *
+                                    (W(i, j, k) + W(i, j - 1, k))) +
+                        gamma * inverseDy * 0.25 *
+                            (fabs(V(i, j, k) + V(i, j, k + 1)) *
+                                    (W(i, j, k) - W(i, j + 1, k)) +
+                                fabs(V(i, j - 1, k + 1) + V(i, j - 1, k)) *
+                                    (W(i, j, k) - W(i, j - 1, k)));
+
+                dw2dz = inverseDz * 0.25 *
+                            ((W(i, j, k) + W(i, j, k + 1)) *
+                                    (W(i, j, k) + W(i, j, k + 1)) -
+                                (W(i, j, k) + W(i, j, k - 1)) *
+                                    (W(i, j, k) + W(i, j, k - 1))) +
+                        gamma * inverseDz * 0.25 *
+                            (fabs(W(i, j, k) + W(i, j, k + 1)) *
+                                    (W(i, j, k) - W(i, j, k + 1)) +
+                                fabs(W(i, j, k) + W(i, j, k - 1)) *
+                                    (W(i, j, k) - W(i, j, k - 1)));
+
+                dw2dx2 = inverseDx * inverseDx *
+                         (W(i + 1, j, k) - 2.0 * W(i, j, k) + W(i - 1, j, k));
+                dw2dy2 = inverseDy * inverseDy *
+                         (W(i, j + 1, k) - 2.0 * W(i, j, k) + W(i, j - 1, k));
+                dw2dz2 = inverseDz * inverseDz *
+                         (W(i, j, k + 1) - 2.0 * W(i, j, k) + W(i, j, k - 1));
+                H(i, j, k) = W(i, j, k) + dt * (inverseRe * (dw2dx2 + dw2dy2 + dw2dz2) -
+                                                   duwdx - dvwdy - dw2dz + gz);
+            }
+        }
+    }
+
+    /* ----------------------------- boundary of F ---------------------------
+     */
+    if (commIsBoundary(&s->comm, LEFT)) {
+        for (int k = 1; k < kmaxLocal + 1; k++) {
+            for (int j = 1; j < jmaxLocal + 1; j++) {
+                F(0, j, k) = U(0, j, k);
+            }
+        }
+    }
+
+    if (commIsBoundary(&s->comm, RIGHT)) {
+        for (int k = 1; k < kmaxLocal + 1; k++) {
+            for (int j = 1; j < jmaxLocal + 1; j++) {
+                F(imaxLocal, j, k) = U(imaxLocal, j, k);
+            }
+        }
+    }
+
+    /* ----------------------------- boundary of G ---------------------------
+     */
+    if (commIsBoundary(&s->comm, BOTTOM)) {
+        for (int k = 1; k < kmaxLocal + 1; k++) {
+            for (int i = 1; i < imaxLocal + 1; i++) {
+                G(i, 0, k) = V(i, 0, k);
+            }
+        }
+    }
+
+    if (commIsBoundary(&s->comm, TOP)) {
+        for (int k = 1; k < kmaxLocal + 1; k++) {
+            for (int i = 1; i < imaxLocal + 1; i++) {
+                G(i, jmaxLocal, k) = V(i, jmaxLocal, k);
+            }
+        }
+    }
+
+    /* ----------------------------- boundary of H ---------------------------
+     */
+    if (commIsBoundary(&s->comm, FRONT)) {
+        for (int j = 1; j < jmaxLocal + 1; j++) {
+            for (int i = 1; i < imaxLocal + 1; i++) {
+                H(i, j, 0) = W(i, j, 0);
+            }
+        }
+    }
+
+    if (commIsBoundary(&s->comm, BACK)) {
+        for (int j = 1; j < jmaxLocal + 1; j++) {
+            for (int i = 1; i < imaxLocal + 1; i++) {
+                H(i, j, kmaxLocal) = W(i, j, kmaxLocal);
+            }
+        }
+    }
+}
+
+void adaptUV(Solver* s)
+{
+    int imaxLocal = s->comm.imaxLocal;
+    int jmaxLocal = s->comm.jmaxLocal;
+    int kmaxLocal = s->comm.kmaxLocal;
+
+    double* p = s->p;
+    double* u = s->u;
+    double* v = s->v;
+    double* w = s->w;
+    double* f = s->f;
+    double* g = s->g;
+    double* h = s->h;
+
+    double factorX = s->dt / s->grid.dx;
+    double factorY = s->dt / s->grid.dy;
+    double factorZ = s->dt / s->grid.dz;
+
+    for (int k = 1; k < kmaxLocal + 1; k++) {
+        for (int j = 1; j < jmaxLocal + 1; j++) {
+            for (int i = 1; i < imaxLocal + 1; i++) {
+                U(i, j, k) = F(i, j, k) - (P(i + 1, j, k) - P(i, j, k)) * factorX;
+                V(i, j, k) = G(i, j, k) - (P(i, j + 1, k) - P(i, j, k)) * factorY;
+                W(i, j, k) = H(i, j, k) - (P(i, j, k + 1) - P(i, j, k)) * factorZ;
+            }
+        }
+    }
+}
--- a/BasicSolver/3D-mpi-io/src/solver.h
+++ b/BasicSolver/3D-mpi-io/src/solver.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of nusif-solver.
+ * Use of this source code is governed by a MIT style
+ * license that can be found in the LICENSE file.
+ */
+#ifndef __SOLVER_H_
+#define __SOLVER_H_
+#include "comm.h"
+#include "grid.h"
+#include "parameter.h"
+
+enum BC { NOSLIP = 1, SLIP, OUTFLOW, PERIODIC };
+
+typedef struct {
+    /* geometry and grid information */
+    Grid grid;
+    /* arrays */
+    double *p, *rhs;
+    double *f, *g, *h;
+    double *u, *v, *w;
+    /* parameters */
+    double eps, omega;
+    double re, tau, gamma;
+    double gx, gy, gz;
+    /* time stepping */
+    int itermax;
+    double dt, te;
+    double dtBound;
+    char* problem;
+    int bcLeft, bcRight, bcBottom, bcTop, bcFront, bcBack;
+    /* communication */
+    Comm comm;
+} Solver;
+
+void initSolver(Solver*, Parameter*);
+void computeRHS(Solver*);
+void solve(Solver*);
+void normalizePressure(Solver*);
+void computeTimestep(Solver*);
+void setBoundaryConditions(Solver*);
+void setSpecialBoundaryCondition(Solver*);
+void computeFG(Solver*);
+void adaptUV(Solver*);
+#endif
--- a/BasicSolver/3D-mpi-io/src/test.c
+++ b/BasicSolver/3D-mpi-io/src/test.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved.
+ * Use of this source code is governed by a MIT-style
+ * license that can be found in the LICENSE file.
+ */
+#include <stdio.h>
+#include <string.h>
+
+#include "test.h"
+
+#define G(v, i, j, k)                                                                    \
+    v[(k) * (imaxLocal + 2) * (jmaxLocal + 2) + (j) * (imaxLocal + 2) + (i)]
+
+void testInit(Solver* s)
+{
+    int imaxLocal = s->comm.imaxLocal;
+    int jmaxLocal = s->comm.jmaxLocal;
+    int kmaxLocal = s->comm.kmaxLocal;
+    int myrank    = s->comm.rank;
+    double* p     = s->p;
+    double* f     = s->f;
+    double* g     = s->g;
+    double* h     = s->h;
+
+    for (int k = 0; k < kmaxLocal + 2; k++) {
+        for (int j = 0; j < jmaxLocal + 2; j++) {
+            for (int i = 0; i < imaxLocal + 2; i++) {
+                G(p, i, j, k) = myrank;
+                G(f, i, j, k) = myrank;
+                G(g, i, j, k) = myrank;
+                G(h, i, j, k) = myrank;
+            }
+        }
+    }
+}
+
+static char* direction2String(Direction dir)
+{
+    switch (dir) {
+    case LEFT:
+        return "left";
+        break;
+    case RIGHT:
+        return "right";
+        break;
+    case BOTTOM:
+        return "bottom";
+        break;
+    case TOP:
+        return "top";
+        break;
+    case FRONT:
+        return "front";
+        break;
+    case BACK:
+        return "back";
+        break;
+    case NDIRS:
+        return "ERROR";
+    default:
+        return "ERROR";
+    }
+}
+
+static void printPlane(Solver* s, double* a, int ymax, int xmax, Direction dir)
+{
+    int imaxLocal = s->comm.imaxLocal;
+    int jmaxLocal = s->comm.jmaxLocal;
+    int kmaxLocal = s->comm.kmaxLocal;
+    char filename[50];
+    snprintf(filename, 50, "halo-%s-r%d.txt", direction2String(dir), s->comm.rank);
+    FILE* fh = fopen(filename, "w");
+
+    for (int y = 0; y < ymax; y++) {
+        for (int x = 0; x < xmax; x++) {
+            switch (dir) {
+            case LEFT:
+                fprintf(fh, "%12.8f ", G(a, 0, x, y));
+                break;
+            case RIGHT:
+                fprintf(fh, "%12.8f ", G(a, imaxLocal + 1, x, y));
+                break;
+            case BOTTOM:
+                fprintf(fh, "%12.8f ", G(a, x, 0, y));
+                break;
+            case TOP:
+                fprintf(fh, "%12.8f ", G(a, x, jmaxLocal + 1, y));
+                break;
+            case FRONT:
+                fprintf(fh, "%12.8f ", G(a, x, y, 0));
+                break;
+            case BACK:
+                fprintf(fh, "%12.8f ", G(a, x, y, kmaxLocal + 1));
+                break;
+            case NDIRS:
+                printf("ERROR\n");
+                break;
+            }
+        }
+        fprintf(fh, "\n");
+    }
+    fclose(fh);
+}
+
+void testPrintHalo(Solver* s, double* a)
+{
+    int imaxLocal = s->comm.imaxLocal;
+    int jmaxLocal = s->comm.jmaxLocal;
+    int kmaxLocal = s->comm.kmaxLocal;
+
+    printPlane(s, a, kmaxLocal + 2, imaxLocal + 2, BOTTOM);
+    printPlane(s, a, kmaxLocal + 2, imaxLocal + 2, TOP);
+    printPlane(s, a, kmaxLocal + 2, jmaxLocal + 2, LEFT);
+    printPlane(s, a, kmaxLocal + 2, jmaxLocal + 2, RIGHT);
+    printPlane(s, a, jmaxLocal + 2, imaxLocal + 2, FRONT);
+    printPlane(s, a, jmaxLocal + 2, imaxLocal + 2, BACK);
+}
--- a/BasicSolver/3D-mpi-io/src/test.h
+++ b/BasicSolver/3D-mpi-io/src/test.h
@@ -0,0 +1,13 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of nusif-solver.
+ * Use of this source code is governed by a MIT style
+ * license that can be found in the LICENSE file.
+ */
+#ifndef __TEST_H_
+#define __TEST_H_
+#include "solver.h"
+
+extern void testInit(Solver* s);
+extern void testPrintHalo(Solver* s, double* a);
+#endif // __TEST_H_
--- a/BasicSolver/3D-mpi-io/src/timing.c
+++ b/BasicSolver/3D-mpi-io/src/timing.c
@@ -0,0 +1,24 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved.
+ * Use of this source code is governed by a MIT-style
+ * license that can be found in the LICENSE file.
+ */
+#include <stdlib.h>
+#include <time.h>
+
+double getTimeStamp()
+{
+    struct timespec ts;
+    clock_gettime(CLOCK_MONOTONIC, &ts);
+    return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
+}
+
+double getTimeResolution()
+{
+    struct timespec ts;
+    clock_getres(CLOCK_MONOTONIC, &ts);
+    return (double)ts.tv_sec + (double)ts.tv_nsec * 1.e-9;
+}
+
+double getTimeStamp_() { return getTimeStamp(); }
--- a/BasicSolver/3D-mpi-io/src/timing.h
+++ b/BasicSolver/3D-mpi-io/src/timing.h
@@ -0,0 +1,14 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved.
+ * Use of this source code is governed by a MIT-style
+ * license that can be found in the LICENSE file.
+ */
+#ifndef __TIMING_H_
+#define __TIMING_H_
+
+extern double getTimeStamp();
+extern double getTimeResolution();
+extern double getTimeStamp_();
+
+#endif // __TIMING_H_
--- a/BasicSolver/3D-mpi-io/src/util.h
+++ b/BasicSolver/3D-mpi-io/src/util.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved.
+ * Use of this source code is governed by a MIT-style
+ * license that can be found in the LICENSE file.
+ */
+#ifndef __UTIL_H_
+#define __UTIL_H_
+#define HLINE                                                                            \
+    "----------------------------------------------------------------------------\n"
+
+#ifndef MIN
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+#endif
+#ifndef MAX
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+#endif
+#ifndef ABS
+#define ABS(a) ((a) >= 0 ? (a) : -(a))
+#endif
+
+#endif // __UTIL_H_
--- a/BasicSolver/3D-mpi-io/src/vtkWriter.c
+++ b/BasicSolver/3D-mpi-io/src/vtkWriter.c
@@ -0,0 +1,165 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of nusif-solver.
+ * Use of this source code is governed by a MIT style
+ * license that can be found in the LICENSE file.
+ */
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "vtkWriter.h"
+#define G(v, i, j, k) v[(k)*imax * jmax + (j)*imax + (i)]
+
+static float floatSwap(float f)
+{
+    union {
+        float f;
+        char b[4];
+    } dat1, dat2;
+
+    dat1.f    = f;
+    dat2.b[0] = dat1.b[3];
+    dat2.b[1] = dat1.b[2];
+    dat2.b[2] = dat1.b[1];
+    dat2.b[3] = dat1.b[0];
+    return dat2.f;
+}
+
+static void writeHeader(VtkOptions* o)
+{
+    fprintf(o->fh, "# vtk DataFile Version 3.0\n");
+    fprintf(o->fh, "PAMPI cfd solver output\n");
+    if (o->fmt == ASCII) {
+        fprintf(o->fh, "ASCII\n");
+    } else if (o->fmt == BINARY) {
+        fprintf(o->fh, "BINARY\n");
+    }
+
+    fprintf(o->fh, "DATASET STRUCTURED_POINTS\n");
+    fprintf(o->fh, "DIMENSIONS %d %d %d\n", o->grid.imax, o->grid.jmax, o->grid.kmax);
+    fprintf(o->fh,
+        "ORIGIN %f %f %f\n",
+        o->grid.dx * 0.5,
+        o->grid.dy * 0.5,
+        o->grid.dz * 0.5);
+    fprintf(o->fh, "SPACING %f %f %f\n", o->grid.dx, o->grid.dy, o->grid.dz);
+    fprintf(o->fh, "POINT_DATA %d\n", o->grid.imax * o->grid.jmax * o->grid.kmax);
+}
+
+void vtkOpen(VtkOptions* o, char* problem)
+{
+    char filename[50];
+
+    if (o->mode == UNIX) {
+        if (commIsMaster(&o->comm)) {
+            snprintf(filename, 50, "%s-p%d.vtk", problem, o->comm.size);
+            o->fh = fopen(filename, "w");
+            writeHeader(o);
+        }
+    } else if (o->mode == MPI) {
+    }
+
+    if (commIsMaster(&o->comm)) printf("Writing VTK output for %s\n", problem);
+}
+
+static void writeScalar(VtkOptions* o, double* s)
+{
+    int imax = o->grid.imax;
+    int jmax = o->grid.jmax;
+    int kmax = o->grid.kmax;
+
+    for (int k = 0; k < kmax; k++) {
+        for (int j = 0; j < jmax; j++) {
+            for (int i = 0; i < imax; i++) {
+                if (o->fmt == ASCII) {
+                    fprintf(o->fh, "%f\n", G(s, i, j, k));
+                } else if (o->fmt == BINARY) {
+                    fwrite((float[1]) { floatSwap(G(s, i, j, k)) },
+                        sizeof(float),
+                        1,
+                        o->fh);
+                }
+            }
+        }
+    }
+    if (o->fmt == BINARY) fprintf(o->fh, "\n");
+}
+
+static bool isInitialized(FILE* ptr)
+{
+    if (ptr == NULL) {
+        printf("vtkWriter not initialize! Call vtkOpen first!\n");
+        return false;
+    }
+    return true;
+}
+
+void vtkScalar(VtkOptions* o, char* name, double* s)
+{
+    if (commIsMaster(&o->comm)) printf("Register scalar %s\n", name);
+
+    if (o->mode == UNIX) {
+        if (commIsMaster(&o->comm)) {
+            if (!isInitialized(o->fh)) return;
+            fprintf(o->fh, "SCALARS %s float 1\n", name);
+            fprintf(o->fh, "LOOKUP_TABLE default\n");
+            writeScalar(o, s);
+        }
+    } else if (o->mode == MPI) {
+    }
+}
+
+static void writeVector(VtkOptions* o, VtkVector vec)
+{
+    int imax = o->grid.imax;
+    int jmax = o->grid.jmax;
+    int kmax = o->grid.kmax;
+
+    for (int k = 0; k < kmax; k++) {
+        for (int j = 0; j < jmax; j++) {
+            for (int i = 0; i < imax; i++) {
+                if (o->fmt == ASCII) {
+                    fprintf(o->fh,
+                        "%f %f %f\n",
+                        G(vec.u, i, j, k),
+                        G(vec.v, i, j, k),
+                        G(vec.w, i, j, k));
+                } else if (o->fmt == BINARY) {
+                    fwrite((float[3]) { floatSwap(G(vec.u, i, j, k)),
+                               floatSwap(G(vec.v, i, j, k)),
+                               floatSwap(G(vec.w, i, j, k)) },
+                        sizeof(float),
+                        3,
+                        o->fh);
+                }
+            }
+        }
+    }
+    if (o->fmt == BINARY) fprintf(o->fh, "\n");
+}
+
+void vtkVector(VtkOptions* o, char* name, VtkVector vec)
+{
+    if (commIsMaster(&o->comm)) printf("Register vector %s\n", name);
+
+    if (o->mode == UNIX) {
+        if (commIsMaster(&o->comm)) {
+            if (!isInitialized(o->fh)) return;
+            fprintf(o->fh, "VECTORS %s float\n", name);
+            writeVector(o, vec);
+        }
+    } else if (o->mode == MPI) {
+    }
+}
+
+void vtkClose(VtkOptions* o)
+{
+    if (o->mode == UNIX) {
+        if (commIsMaster(&o->comm)) {
+            fclose(o->fh);
+            o->fh = NULL;
+        }
+    } else if (o->mode == MPI) {
+    }
+}
--- a/BasicSolver/3D-mpi-io/src/vtkWriter.h
+++ b/BasicSolver/3D-mpi-io/src/vtkWriter.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2022 NHR@FAU, University Erlangen-Nuremberg.
+ * All rights reserved. This file is part of nusif-solver.
+ * Use of this source code is governed by a MIT style
+ * license that can be found in the LICENSE file.
+ */
+#ifndef __VTKWRITER_H_
+#define __VTKWRITER_H_
+#include <stdio.h>
+
+#include "comm.h"
+#include "grid.h"
+
+typedef enum VtkFormat { ASCII = 0, BINARY } VtkFormat;
+typedef enum VtkMode { UNIX = 0, MPI } VtkMode;
+
+typedef struct VtkOptions {
+    VtkFormat fmt;
+    VtkMode mode;
+    Grid grid;
+    FILE* fh;
+    Comm comm;
+} VtkOptions;
+
+typedef struct VtkVector {
+    double *u, *v, *w;
+} VtkVector;
+
+extern void vtkOpen(VtkOptions* opts, char* filename);
+extern void vtkVector(VtkOptions* opts, char* name, VtkVector vec);
+extern void vtkScalar(VtkOptions* opts, char* name, double* p);
+extern void vtkClose(VtkOptions* opts);
+#endif // __VTKWRITER_H_
--- a/BasicSolver/3D-mpi/.cache/clangd/index/affinity.c.B7C05C30E5FF4519.idx
+++ b/BasicSolver/3D-mpi/.cache/clangd/index/affinity.c.B7C05C30E5FF4519.idx
--- a/BasicSolver/3D-mpi/.cache/clangd/index/allocate.c.ECF0527ECC6DF781.idx
+++ b/BasicSolver/3D-mpi/.cache/clangd/index/allocate.c.ECF0527ECC6DF781.idx
--- a/BasicSolver/3D-mpi/.cache/clangd/index/allocate.h.E842D58079DB450B.idx
+++ b/BasicSolver/3D-mpi/.cache/clangd/index/allocate.h.E842D58079DB450B.idx
--- a/BasicSolver/3D-mpi/.cache/clangd/index/main.c.DC3EC91E37026717.idx
+++ b/BasicSolver/3D-mpi/.cache/clangd/index/main.c.DC3EC91E37026717.idx
--- a/BasicSolver/3D-mpi/.cache/clangd/index/parameter.c.C63CC045C4D27867.idx
+++ b/BasicSolver/3D-mpi/.cache/clangd/index/parameter.c.C63CC045C4D27867.idx
--- a/BasicSolver/3D-mpi/.cache/clangd/index/parameter.h.4BBCC8C05260A054.idx
+++ b/BasicSolver/3D-mpi/.cache/clangd/index/parameter.h.4BBCC8C05260A054.idx
--- a/BasicSolver/3D-mpi/.cache/clangd/index/progress.c.6A59AB8463A0FDB4.idx
+++ b/BasicSolver/3D-mpi/.cache/clangd/index/progress.c.6A59AB8463A0FDB4.idx
--- a/BasicSolver/3D-mpi/.cache/clangd/index/progress.h.00B7B5207E4C4A3D.idx
+++ b/BasicSolver/3D-mpi/.cache/clangd/index/progress.h.00B7B5207E4C4A3D.idx
--- a/BasicSolver/3D-mpi/.cache/clangd/index/solver.c.C754E0A082732D4F.idx
+++ b/BasicSolver/3D-mpi/.cache/clangd/index/solver.c.C754E0A082732D4F.idx
--- a/BasicSolver/3D-mpi/.cache/clangd/index/solver.h.4DC1D59CDF9898D1.idx
+++ b/BasicSolver/3D-mpi/.cache/clangd/index/solver.h.4DC1D59CDF9898D1.idx
--- a/BasicSolver/3D-mpi/.cache/clangd/index/timing.c.3A026594A4A42B70.idx
+++ b/BasicSolver/3D-mpi/.cache/clangd/index/timing.c.3A026594A4A42B70.idx
--- a/BasicSolver/3D-mpi/.cache/clangd/index/timing.h.65881B97289F66D9.idx
+++ b/BasicSolver/3D-mpi/.cache/clangd/index/timing.h.65881B97289F66D9.idx
--- a/BasicSolver/3D-mpi/.cache/clangd/index/util.h.47A029E807954814.idx
+++ b/BasicSolver/3D-mpi/.cache/clangd/index/util.h.47A029E807954814.idx
--- a/BasicSolver/3D-mpi/.cache/clangd/index/vtkWriter.c.D6920E8109B5D2E8.idx
+++ b/BasicSolver/3D-mpi/.cache/clangd/index/vtkWriter.c.D6920E8109B5D2E8.idx
--- a/BasicSolver/3D-mpi/.cache/clangd/index/vtkWriter.h.27987C0C4DAEF54F.idx
+++ b/BasicSolver/3D-mpi/.cache/clangd/index/vtkWriter.h.27987C0C4DAEF54F.idx
--- a/BasicSolver/README.md
+++ b/BasicSolver/README.md
@@ -8,35 +8,47 @@ All basic solver variants include two test cases for validation:

 # 2D solver variants
 ## Sequential solver (2D-seq)
-This is the basic sequential version.
+This is the basic sequential version. Gnuplot result visualization.

 ## Sequential solver with particle tracing (2D-seq-pt)
 This version adds particle tracing and streak lines to the sequential basic solver.
+Gnuplot result visualization.

 ## Simple MPI parallel solver (2D-mpi-v1)
 The simplest possible MPI parallelization with domain decomposition in one
 direction and communication just based on simple send and recv calls.
+Gnuplot result visualization.

 ## MPI parallel solver with 2D domain decomposition (2D-mpi-v2)
 A MPI parallelization with two-dimensional domain decomposition using MPI
 virtual topologies.
+Gnuplot result visualization.

 ## MPI parallel solver using MPI-3 neighborhood collectives (2D-mpi-v3)
 A MPI parallelization with two-dimensional domain decomposition using
 neighborhood collective call instead of send and recv calls.
+Gnuplot result visualization.

 ## Refactored MPI parallel solver (2D-mpi)
 The final version of the 2D MPI parallel solver. All MPI calls are contained in
 a single communication module. The rest of the code does not depend on MPI.
 This version is prepared to also compile and run without MPI.
+VTK result visualization.

 # 3D solver variants

 ## Sequential solver (3D-seq)
 This is the basic sequential version.
+VTK result visualization.

 ## MPI parallel solver (3D-mpi)
 A MPI parallel solver with 3D domain decomposition using MPI virtual topologies
 and neighborhood collectives. All MPI calls are contained in a single
 communication module. The rest of the code does not depend on MPI. This version
 is prepared to also compile and run without MPI.
+VTK result visualization.
+
+## MPI parallel solver with MPI-IO (3D-mpi-io)
+Identical to the 3D-MPI variant but using MPI-IO for VTK result file output.
+
+