Seuquential version finished. Start with MPI stuff
Does not compile.
This commit is contained in:
parent
3428d84c3d
commit
512e2903c8
2
Makefile
2
Makefile
@ -27,7 +27,7 @@ clist = $(subst $(eval) ,$c,$(strip $1))
|
||||
|
||||
define CLANGD_TEMPLATE
|
||||
CompileFlags:
|
||||
Add: [$(call clist,$(INCLUDES)), $(call clist,$(CPPFLAGS))]
|
||||
Add: [$(call clist,$(INCLUDES)), $(call clist,$(DEFINES)), $(call clist,$(OPTIONS)), -xc]
|
||||
Compiler: clang
|
||||
endef
|
||||
|
||||
|
@ -1,9 +1,11 @@
|
||||
# Supported: GCC, CLANG, ICC
|
||||
TOOLCHAIN ?= CLANG
|
||||
ENABLE_MPI ?= true
|
||||
# ENABLE_OPENMP ?= false
|
||||
|
||||
#Feature options
|
||||
OPTIONS += -DARRAY_ALIGNMENT=64
|
||||
#OPTIONS += -DVERBOSE
|
||||
#OPTIONS += -DVERBOSE_AFFINITY
|
||||
#OPTIONS += -DVERBOSE_DATASIZE
|
||||
#OPTIONS += -DVERBOSE_TIMER
|
||||
|
11
hpcg.par
Normal file
11
hpcg.par
Normal file
@ -0,0 +1,11 @@
|
||||
#==============================================================================
|
||||
# HPCG 27pt stencil
|
||||
#==============================================================================
|
||||
|
||||
filename generate
|
||||
nx 50
|
||||
ny 50
|
||||
nz 50
|
||||
|
||||
itermax 10000
|
||||
eps 0.0001
|
@ -1,7 +1,13 @@
|
||||
ifeq ($(strip $(ENABLE_MPI)),true)
|
||||
CC = mpicc
|
||||
DEFINES = -D_MPI
|
||||
else
|
||||
CC = clang
|
||||
endif
|
||||
|
||||
LD = $(CC)
|
||||
|
||||
ifeq ($(ENABLE_OPENMP),true)
|
||||
ifeq ($(strip $(ENABLE_OPENMP)),true)
|
||||
OPENMP = -fopenmp
|
||||
#OPENMP = -Xpreprocessor -fopenmp #required on Macos with homebrew libomp
|
||||
LIBS = # -lomp
|
||||
@ -11,5 +17,5 @@ VERSION = --version
|
||||
CFLAGS = -Ofast -std=c99 $(OPENMP)
|
||||
#CFLAGS = -Ofast -fnt-store=aggressive -std=c99 $(OPENMP) #AMD CLANG
|
||||
LFLAGS = $(OPENMP)
|
||||
DEFINES = -D_GNU_SOURCE
|
||||
INCLUDES = -I~/.local/include
|
||||
DEFINES += -D_GNU_SOURCE
|
||||
INCLUDES = -I/Users/jan/.local/include
|
||||
|
586
src/comm.c
586
src/comm.c
@ -1,14 +1,23 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
/* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of nusif-solver.
|
||||
* Use of this source code is governed by a MIT style
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include "util.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef _MPI
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
|
||||
#include "allocate.h"
|
||||
#include "comm.h"
|
||||
|
||||
#define MAX_EXTERNAL 100000
|
||||
#define MAX_NUM_MESSAGES 500
|
||||
#define MAX_NUM_NEIGHBOURS MAX_NUM_MESSAGES
|
||||
|
||||
// subroutines local to this module
|
||||
int sizeOfRank(int rank, int size, int N)
|
||||
{
|
||||
@ -26,6 +35,528 @@ void commReduction(double* v, int op)
|
||||
#endif
|
||||
}
|
||||
|
||||
void commPartition(Comm* c, Matrix* A)
|
||||
{
|
||||
int rank = c->rank;
|
||||
int size = c->size;
|
||||
MPI_Comm comm = c->comm;
|
||||
|
||||
// Extract Matrix pieces
|
||||
int start_row = A->startRow;
|
||||
int stop_row = A->stopRow;
|
||||
int total_nrow = A->totalNr;
|
||||
long long total_nnz = A->totalNnz;
|
||||
int local_nrow = A->nr;
|
||||
int local_nnz = A->nnz;
|
||||
int* row_ptr = (int*)A->rowPtr;
|
||||
int* col_ind = (int*)A->colInd;
|
||||
|
||||
// int* nnz_in_row = A->nnz_in_row;
|
||||
// double** ptr_to_vals_in_row = A->ptr_to_vals_in_row;
|
||||
// int** ptr_to_inds_in_row = A->ptr_to_inds_in_row;
|
||||
|
||||
// We need to convert the index values for the rows on this processor
|
||||
// to a local index space. We need to:
|
||||
// - Determine if each index reaches to a local value or external value
|
||||
// - If local, subtract start_row from index value to get local index
|
||||
// - If external, find out if it is already accounted for.
|
||||
// - If so, then do nothing,
|
||||
// - otherwise
|
||||
// - add it to the list of external indices,
|
||||
// - find out which processor owns the value.
|
||||
// - Set up communication for sparse MV operation.
|
||||
|
||||
///////////////////////////////////////////
|
||||
// Scan the indices and transform to local
|
||||
///////////////////////////////////////////
|
||||
|
||||
int* external_index = (int*)allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
|
||||
int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
|
||||
int num_external = 1;
|
||||
|
||||
c->external_index = external_index;
|
||||
|
||||
for (int i = 0; i < A->totalNr; i++) {
|
||||
externals[i] = -1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < local_nrow; i++) {
|
||||
for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) {
|
||||
int cur_ind = A->colInd[j];
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("Process %d of %d getting index %d in local row %d\n",
|
||||
rank,
|
||||
size,
|
||||
cur_ind,
|
||||
i);
|
||||
#endif
|
||||
|
||||
// shift local rows to the start
|
||||
if (start_row <= cur_ind && cur_ind <= stop_row) {
|
||||
col_ind[j] -= start_row;
|
||||
} else // Must find out if we have already set up this point
|
||||
{
|
||||
if (externals[cur_ind] == -1) {
|
||||
externals[cur_ind] = num_external++;
|
||||
|
||||
if (num_external <= MAX_EXTERNAL) {
|
||||
external_index[num_external - 1] = cur_ind;
|
||||
// Mark index as external by negating it
|
||||
ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1);
|
||||
} else {
|
||||
cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp"
|
||||
<< endl;
|
||||
abort();
|
||||
}
|
||||
} else {
|
||||
// Mark index as external by adding 1 and negating it
|
||||
ptr_to_inds_in_row[i][j] = -(ptr_to_inds_in_row[i][j] + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Go through list of externals to find out which processors must be accessed.
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
A->num_external = num_external;
|
||||
int* tmp_buffer = new int[size]; // Temp buffer space needed below
|
||||
|
||||
// Build list of global index offset
|
||||
|
||||
int* global_index_offsets = new int[size];
|
||||
for (i = 0; i < size; i++)
|
||||
tmp_buffer[i] = 0; // First zero out
|
||||
|
||||
tmp_buffer[rank] = start_row; // This is my start row
|
||||
|
||||
// This call sends the start_row of each ith processor to the ith
|
||||
// entry of global_index_offset on all processors.
|
||||
// Thus, each processor know the range of indices owned by all
|
||||
// other processors.
|
||||
// Note: There might be a better algorithm for doing this, but this
|
||||
// will work...
|
||||
|
||||
MPI_Allreduce(tmp_buffer,
|
||||
global_index_offsets,
|
||||
size,
|
||||
MPI_INT,
|
||||
MPI_SUM,
|
||||
MPI_COMM_WORLD);
|
||||
|
||||
// Go through list of externals and find the processor that owns each
|
||||
int* external_processor = new int[num_external];
|
||||
int* new_external_processor = new int[num_external];
|
||||
|
||||
for (i = 0; i < num_external; i++) {
|
||||
int cur_ind = external_index[i];
|
||||
for (int j = size - 1; j >= 0; j--)
|
||||
if (global_index_offsets[j] <= cur_ind) {
|
||||
external_processor[i] = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (debug) {
|
||||
t0 = mytimer() - t0;
|
||||
cout << " Time in finding processors phase = " << t0 << endl;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Sift through the external elements. For each newly encountered external
|
||||
// point assign it the next index in the sequence. Then look for other
|
||||
// external elements who are update by the same node and assign them the next
|
||||
// set of index numbers in the sequence (ie. elements updated by the same node
|
||||
// have consecutive indices).
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
if (debug) t0 = mytimer();
|
||||
|
||||
int count = local_nrow;
|
||||
for (i = 0; i < num_external; i++)
|
||||
external_local_index[i] = -1;
|
||||
|
||||
for (i = 0; i < num_external; i++) {
|
||||
if (external_local_index[i] == -1) {
|
||||
external_local_index[i] = count++;
|
||||
|
||||
for (j = i + 1; j < num_external; j++) {
|
||||
if (external_processor[j] == external_processor[i])
|
||||
external_local_index[j] = count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (debug) {
|
||||
t0 = mytimer() - t0;
|
||||
cout << " Time in scanning external indices phase = " << t0 << endl;
|
||||
}
|
||||
if (debug) t0 = mytimer();
|
||||
|
||||
for (i = 0; i < local_nrow; i++) {
|
||||
for (j = 0; j < nnz_in_row[i]; j++) {
|
||||
if (ptr_to_inds_in_row[i][j] < 0) // Change index values of externals
|
||||
{
|
||||
int cur_ind = -ptr_to_inds_in_row[i][j] - 1;
|
||||
ptr_to_inds_in_row[i][j] = external_local_index[externals[cur_ind]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < num_external; i++)
|
||||
new_external_processor[i] = 0;
|
||||
|
||||
for (i = 0; i < num_external; i++)
|
||||
new_external_processor[external_local_index[i] - local_nrow] =
|
||||
external_processor[i];
|
||||
|
||||
if (debug) {
|
||||
t0 = mytimer() - t0;
|
||||
cout << " Time in assigning external indices phase = " << t0 << endl;
|
||||
}
|
||||
|
||||
if (debug_details) {
|
||||
for (i = 0; i < num_external; i++) {
|
||||
cout << "Processor " << rank << " of " << size << ": external processor[" << i
|
||||
<< "] = " << external_processor[i] << endl;
|
||||
cout << "Processor " << rank << " of " << size << ": new external processor["
|
||||
<< i << "] = " << new_external_processor[i] << endl;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
///
|
||||
// Count the number of neighbors from which we receive information to update
|
||||
// our external elements. Additionally, fill the array tmp_neighbors in the
|
||||
// following way:
|
||||
// tmp_neighbors[i] = 0 ==> No external elements are updated by
|
||||
// processor i.
|
||||
// tmp_neighbors[i] = x ==> (x-1)/size elements are updated from
|
||||
// processor i.
|
||||
///
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
t0 = mytimer();
|
||||
int* tmp_neighbors = new int[size];
|
||||
for (i = 0; i < size; i++)
|
||||
tmp_neighbors[i] = 0;
|
||||
|
||||
int num_recv_neighbors = 0;
|
||||
int length = 1;
|
||||
|
||||
for (i = 0; i < num_external; i++) {
|
||||
if (tmp_neighbors[new_external_processor[i]] == 0) {
|
||||
num_recv_neighbors++;
|
||||
tmp_neighbors[new_external_processor[i]] = 1;
|
||||
}
|
||||
tmp_neighbors[new_external_processor[i]] += size;
|
||||
}
|
||||
|
||||
/// sum over all processors all the tmp_neighbors arrays ///
|
||||
|
||||
MPI_Allreduce(tmp_neighbors, tmp_buffer, size, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
|
||||
/// decode the combined 'tmp_neighbors' (stored in tmp_buffer)
|
||||
// array from all the processors
|
||||
|
||||
int num_send_neighbors = tmp_buffer[rank] % size;
|
||||
|
||||
/// decode 'tmp_buffer[rank] to deduce total number of elements
|
||||
// we must send
|
||||
|
||||
int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;
|
||||
|
||||
//
|
||||
// Check to see if we have enough workspace allocated. This could be
|
||||
// dynamically modified, but let's keep it simple for now...
|
||||
//
|
||||
|
||||
if (num_send_neighbors > MAX_NUM_MESSAGES) {
|
||||
cerr << "Must increase MAX_NUM_MESSAGES in HPC_Sparse_Matrix.hpp" << endl;
|
||||
cerr << "Must be at least " << num_send_neighbors << endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
if (total_to_be_sent > MAX_EXTERNAL) {
|
||||
cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl;
|
||||
cerr << "Must be at least " << total_to_be_sent << endl;
|
||||
abort();
|
||||
}
|
||||
delete[] tmp_neighbors;
|
||||
|
||||
if (debug) {
|
||||
t0 = mytimer() - t0;
|
||||
cout << " Time in finding neighbors phase = " << t0 << endl;
|
||||
}
|
||||
if (debug)
|
||||
cout << "Processor " << rank << " of " << size
|
||||
<< ": Number of send neighbors = " << num_send_neighbors << endl;
|
||||
|
||||
if (debug)
|
||||
cout << "Processor " << rank << " of " << size
|
||||
<< ": Number of receive neighbors = " << num_recv_neighbors << endl;
|
||||
|
||||
if (debug)
|
||||
cout << "Processor " << rank << " of " << size
|
||||
<< ": Total number of elements to send = " << total_to_be_sent << endl;
|
||||
|
||||
if (debug) MPI_Barrier(MPI_COMM_WORLD);
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
///
|
||||
// Make a list of the neighbors that will send information to update our
|
||||
// external elements (in the order that we will receive this information).
|
||||
///
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int* recv_list = new int[MAX_EXTERNAL];
|
||||
|
||||
j = 0;
|
||||
recv_list[j++] = new_external_processor[0];
|
||||
for (i = 1; i < num_external; i++) {
|
||||
if (new_external_processor[i - 1] != new_external_processor[i]) {
|
||||
recv_list[j++] = new_external_processor[i];
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// Send a 0 length message to each of our recv neighbors
|
||||
//
|
||||
|
||||
int* send_list = new int[num_send_neighbors];
|
||||
for (i = 0; i < num_send_neighbors; i++)
|
||||
send_list[i] = 0;
|
||||
|
||||
//
|
||||
// first post receives, these are immediate receives
|
||||
// Do not wait for result to come, will do that at the
|
||||
// wait call below.
|
||||
//
|
||||
int MPI_MY_TAG = 99;
|
||||
|
||||
MPI_Request* request = new MPI_Request[MAX_NUM_MESSAGES];
|
||||
for (i = 0; i < num_send_neighbors; i++) {
|
||||
MPI_Irecv(tmp_buffer + i,
|
||||
1,
|
||||
MPI_INT,
|
||||
MPI_ANY_SOURCE,
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD,
|
||||
request + i);
|
||||
}
|
||||
|
||||
// send messages
|
||||
|
||||
for (i = 0; i < num_recv_neighbors; i++)
|
||||
MPI_Send(tmp_buffer + i, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
|
||||
///
|
||||
// Receive message from each send neighbor to construct 'send_list'.
|
||||
///
|
||||
|
||||
MPI_Status status;
|
||||
for (i = 0; i < num_send_neighbors; i++) {
|
||||
if (MPI_Wait(request + i, &status)) {
|
||||
cerr << "MPI_Wait error\n" << endl;
|
||||
exit(-1);
|
||||
}
|
||||
send_list[i] = status.MPI_SOURCE;
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
///
|
||||
// Compare the two lists. In most cases they should be the same.
|
||||
// However, if they are not then add new entries to the recv list
|
||||
// that are in the send list (but not already in the recv list).
|
||||
///
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
||||
for (j = 0; j < num_send_neighbors; j++) {
|
||||
int found = 0;
|
||||
for (i = 0; i < num_recv_neighbors; i++) {
|
||||
if (recv_list[i] == send_list[j]) found = 1;
|
||||
}
|
||||
|
||||
if (found == 0) {
|
||||
if (debug)
|
||||
cout << "Processor " << rank << " of " << size << ": recv_list["
|
||||
<< num_recv_neighbors << "] = " << send_list[j] << endl;
|
||||
recv_list[num_recv_neighbors] = send_list[j];
|
||||
(num_recv_neighbors)++;
|
||||
}
|
||||
}
|
||||
|
||||
delete[] send_list;
|
||||
num_send_neighbors = num_recv_neighbors;
|
||||
|
||||
if (num_send_neighbors > MAX_NUM_MESSAGES) {
|
||||
cerr << "Must increase MAX_EXTERNAL in HPC_Sparse_Matrix.hpp" << endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
/// Start filling HPC_Sparse_Matrix struct
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
||||
A->total_to_be_sent = total_to_be_sent;
|
||||
int* elements_to_send = new int[total_to_be_sent];
|
||||
A->elements_to_send = elements_to_send;
|
||||
|
||||
for (i = 0; i < total_to_be_sent; i++)
|
||||
elements_to_send[i] = 0;
|
||||
|
||||
//
|
||||
// Create 'new_external' which explicitly put the external elements in the
|
||||
// order given by 'external_local_index'
|
||||
//
|
||||
|
||||
int* new_external = new int[num_external];
|
||||
for (i = 0; i < num_external; i++) {
|
||||
new_external[external_local_index[i] - local_nrow] = external_index[i];
|
||||
}
|
||||
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
//
|
||||
// Send each processor the global index list of the external elements in the
|
||||
// order that I will want to receive them when updating my external elements
|
||||
//
|
||||
/////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int* lengths = new int[num_recv_neighbors];
|
||||
|
||||
MPI_MY_TAG++;
|
||||
|
||||
// First post receives
|
||||
|
||||
for (i = 0; i < num_recv_neighbors; i++) {
|
||||
int partner = recv_list[i];
|
||||
MPI_Irecv(lengths + i,
|
||||
1,
|
||||
MPI_INT,
|
||||
partner,
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD,
|
||||
request + i);
|
||||
}
|
||||
|
||||
int* neighbors = new int[MAX_NUM_NEIGHBOURS];
|
||||
int* recv_length = new int[MAX_NUM_NEIGHBOURS];
|
||||
int* send_length = new int[MAX_NUM_NEIGHBOURS];
|
||||
|
||||
A->neighbors = neighbors;
|
||||
A->recv_length = recv_length;
|
||||
A->send_length = send_length;
|
||||
|
||||
j = 0;
|
||||
for (i = 0; i < num_recv_neighbors; i++) {
|
||||
int start = j;
|
||||
int newlength = 0;
|
||||
|
||||
// go through list of external elements until updating
|
||||
// processor changes
|
||||
|
||||
while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
|
||||
newlength++;
|
||||
j++;
|
||||
if (j == num_external) break;
|
||||
}
|
||||
|
||||
recv_length[i] = newlength;
|
||||
neighbors[i] = recv_list[i];
|
||||
|
||||
length = j - start;
|
||||
MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
// Complete the receives of the number of externals
|
||||
|
||||
for (i = 0; i < num_recv_neighbors; i++) {
|
||||
if (MPI_Wait(request + i, &status)) {
|
||||
cerr << "MPI_Wait error\n" << endl;
|
||||
exit(-1);
|
||||
}
|
||||
send_length[i] = lengths[i];
|
||||
}
|
||||
delete[] lengths;
|
||||
|
||||
///////////////////////////////////////////////////////////////////
|
||||
// Build "elements_to_send" list. These are the x elements I own
|
||||
// that need to be sent to other processors.
|
||||
///////////////////////////////////////////////////////////////////
|
||||
|
||||
MPI_MY_TAG++;
|
||||
|
||||
j = 0;
|
||||
for (i = 0; i < num_recv_neighbors; i++) {
|
||||
MPI_Irecv(elements_to_send + j,
|
||||
send_length[i],
|
||||
MPI_INT,
|
||||
neighbors[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD,
|
||||
request + i);
|
||||
j += send_length[i];
|
||||
}
|
||||
|
||||
j = 0;
|
||||
for (i = 0; i < num_recv_neighbors; i++) {
|
||||
int start = j;
|
||||
int newlength = 0;
|
||||
|
||||
// Go through list of external elements
|
||||
// until updating processor changes. This is redundant, but
|
||||
// saves us from recording this information.
|
||||
|
||||
while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
|
||||
|
||||
newlength++;
|
||||
j++;
|
||||
if (j == num_external) break;
|
||||
}
|
||||
MPI_Send(new_external + start,
|
||||
j - start,
|
||||
MPI_INT,
|
||||
recv_list[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
// receive from each neighbor the global index list of external elements
|
||||
|
||||
for (i = 0; i < num_recv_neighbors; i++) {
|
||||
if (MPI_Wait(request + i, &status)) {
|
||||
cerr << "MPI_Wait error\n" << endl;
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
/// replace global indices by local indices ///
|
||||
|
||||
for (i = 0; i < total_to_be_sent; i++)
|
||||
elements_to_send[i] -= start_row;
|
||||
|
||||
////////////////
|
||||
// Finish up !!
|
||||
////////////////
|
||||
|
||||
A->num_send_neighbors = num_send_neighbors;
|
||||
A->local_ncol = A->local_nrow + num_external;
|
||||
|
||||
// Used in exchange_externals
|
||||
double* send_buffer = new double[total_to_be_sent];
|
||||
A->send_buffer = send_buffer;
|
||||
|
||||
delete[] tmp_buffer;
|
||||
delete[] global_index_offsets;
|
||||
delete[] recv_list;
|
||||
delete[] external_processor;
|
||||
delete[] new_external;
|
||||
delete[] new_external_processor;
|
||||
delete[] request;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void commPrintConfig(Comm* c)
|
||||
{
|
||||
#ifdef _MPI
|
||||
@ -70,57 +601,6 @@ void commInit(Comm* c, int argc, char** argv)
|
||||
#endif
|
||||
}
|
||||
|
||||
void commTestInit(Comm* c, double* p, double* f, double* g)
|
||||
{
|
||||
int imax = c->imaxLocal;
|
||||
int jmax = c->jmaxLocal;
|
||||
int rank = c->rank;
|
||||
|
||||
for (int j = 0; j < jmax + 2; j++) {
|
||||
for (int i = 0; i < imax + 2; i++) {
|
||||
p[j * (imax + 2) + i] = rank;
|
||||
f[j * (imax + 2) + i] = rank;
|
||||
g[j * (imax + 2) + i] = rank;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void testWriteFile(char* filename, double* grid, int imax, int jmax)
|
||||
{
|
||||
FILE* fp = fopen(filename, "w");
|
||||
|
||||
if (fp == NULL) {
|
||||
printf("Error!\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
for (int j = 0; j < jmax + 2; j++) {
|
||||
for (int i = 0; i < imax + 2; i++) {
|
||||
fprintf(fp, "%.2f ", grid[j * (imax + 2) + i]);
|
||||
}
|
||||
fprintf(fp, "\n");
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
}
|
||||
|
||||
void commTestWrite(Comm* c, double* p, double* f, double* g)
|
||||
{
|
||||
int imax = c->imaxLocal;
|
||||
int jmax = c->jmaxLocal;
|
||||
int rank = c->rank;
|
||||
|
||||
char filename[30];
|
||||
snprintf(filename, 30, "ptest-%d.dat", rank);
|
||||
testWriteFile(filename, p, imax, jmax);
|
||||
|
||||
snprintf(filename, 30, "ftest-%d.dat", rank);
|
||||
testWriteFile(filename, f, imax, jmax);
|
||||
|
||||
snprintf(filename, 30, "gtest-%d.dat", rank);
|
||||
testWriteFile(filename, g, imax, jmax);
|
||||
}
|
||||
|
||||
void commFinalize(Comm* c)
|
||||
{
|
||||
#ifdef _MPI
|
||||
|
40
src/comm.h
40
src/comm.h
@ -8,10 +8,8 @@
|
||||
#include <mpi.h>
|
||||
#endif
|
||||
|
||||
enum direction { LEFT = 0, RIGHT, BOTTOM, TOP, NDIRS };
|
||||
enum dimension { IDIM = 0, JDIM, NDIMS };
|
||||
enum cdimension { CJDIM = 0, CIDIM };
|
||||
enum layer { HALO = 0, BULK };
|
||||
#include "matrix.h"
|
||||
|
||||
enum op { MAX = 0, SUM };
|
||||
|
||||
typedef struct {
|
||||
@ -19,37 +17,27 @@ typedef struct {
|
||||
int size;
|
||||
#if defined(_MPI)
|
||||
MPI_Comm comm;
|
||||
MPI_Datatype bufferTypes[NDIRS];
|
||||
MPI_Aint sdispls[NDIRS];
|
||||
MPI_Aint rdispls[NDIRS];
|
||||
|
||||
int num_external;
|
||||
int num_send_neighbors;
|
||||
int* external_index;
|
||||
int* external_local_index;
|
||||
int total_to_be_sent;
|
||||
int* elements_to_send;
|
||||
int* neighbors;
|
||||
int* recv_length;
|
||||
int* send_length;
|
||||
double* send_buffer;
|
||||
#endif
|
||||
int neighbours[NDIRS];
|
||||
int coords[NDIMS], dims[NDIMS];
|
||||
int imaxLocal, jmaxLocal;
|
||||
} Comm;
|
||||
|
||||
extern int sizeOfRank(int rank, int size, int N);
|
||||
extern void commInit(Comm* c, int argc, char** argv);
|
||||
extern void commTestInit(Comm* c, double* p, double* f, double* g);
|
||||
extern void commTestWrite(Comm* c, double* p, double* f, double* g);
|
||||
extern void commFinalize(Comm* c);
|
||||
extern void commPartition(Comm* c, int jmax, int imax);
|
||||
extern void commPartition(Comm*, Matrix* m);
|
||||
extern void commPrintConfig(Comm*);
|
||||
extern void commExchange(Comm*, double*);
|
||||
extern void commShift(Comm* c, double* f, double* g);
|
||||
extern void commReduction(double* v, int op);
|
||||
extern int commIsBoundary(Comm* c, int direction);
|
||||
extern void commUpdateDatatypes(Comm*, Comm*, int, int);
|
||||
extern void commFreeCommunicator(Comm*);
|
||||
extern void commCollectResult(Comm* c,
|
||||
double* ug,
|
||||
double* vg,
|
||||
double* pg,
|
||||
double* u,
|
||||
double* v,
|
||||
double* p,
|
||||
int jmax,
|
||||
int imax);
|
||||
|
||||
static inline int commIsMaster(Comm* c) { return c->rank == 0; }
|
||||
#endif // __COMM_H_
|
||||
|
108
src/main.c
108
src/main.c
@ -2,61 +2,111 @@
|
||||
* All rights reserved.
|
||||
* Use of this source code is governed by a MIT-style
|
||||
* license that can be found in the LICENSE file.*/
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include "allocate.h"
|
||||
#include "comm.h"
|
||||
#include "matrix.h"
|
||||
#include "progress.h"
|
||||
#include "parameter.h"
|
||||
#include "solver.h"
|
||||
#include "timing.h"
|
||||
#include "util.h"
|
||||
|
||||
CG_FLOAT compute_residual(Solver* s)
|
||||
{
|
||||
CG_FLOAT residual = 0.0;
|
||||
int n = s->A.nr;
|
||||
CG_FLOAT* v1 = s->x;
|
||||
CG_FLOAT* v2 = s->xexact;
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
double diff = fabs(v1[i] - v2[i]);
|
||||
if (diff > residual) residual = diff;
|
||||
}
|
||||
|
||||
commReduction(&residual, MAX);
|
||||
|
||||
return residual;
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
int rank;
|
||||
double timeStart, timeStop;
|
||||
Matrix m;
|
||||
Parameter param;
|
||||
Solver s;
|
||||
Comm comm;
|
||||
|
||||
// commInit(s.comm, argc, argv);
|
||||
|
||||
// FILE* fp;
|
||||
// if (commIsMaster(s.comm)) fp = initResidualWriter();
|
||||
commInit(&comm, argc, argv);
|
||||
initParameter(¶m);
|
||||
|
||||
if (argc != 2) {
|
||||
printf("Usage: %s <configFile>\n", argv[0]);
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
matrixRead(&m, argv[1]);
|
||||
matrixDump(&m);
|
||||
exit(EXIT_SUCCESS);
|
||||
readParameter(¶m, argv[1]);
|
||||
|
||||
double te = 1000;
|
||||
double t = 0.0;
|
||||
double res = 0.0;
|
||||
CG_FLOAT eps = (CG_FLOAT)param.eps;
|
||||
int itermax = param.itermax;
|
||||
initSolver(&s, ¶m);
|
||||
// matrixDump(&s.A);
|
||||
|
||||
CG_UINT nrow = s.A.nr;
|
||||
CG_UINT ncol = s.A.nc;
|
||||
CG_FLOAT* r = (CG_FLOAT*)allocate(64, nrow * sizeof(CG_FLOAT));
|
||||
CG_FLOAT* p = (CG_FLOAT*)allocate(64, ncol * sizeof(CG_FLOAT));
|
||||
CG_FLOAT* Ap = (CG_FLOAT*)allocate(64, nrow * sizeof(CG_FLOAT));
|
||||
CG_FLOAT normr = 0.0;
|
||||
CG_FLOAT rtrans = 0.0, oldrtrans;
|
||||
|
||||
waxpby(nrow, 1.0, s.x, 0.0, s.x, p);
|
||||
spMVM(&s.A, p, Ap);
|
||||
waxpby(nrow, 1.0, s.b, -1.0, Ap, r);
|
||||
ddot(nrow, r, r, &rtrans);
|
||||
|
||||
normr = sqrt(rtrans);
|
||||
|
||||
// initial iteration
|
||||
waxpby(nrow, 1.0, r, 0.0, r, p);
|
||||
|
||||
// TICK(); exchange_externals(A,p); TOCK(t5);
|
||||
spMVM(&s.A, p, Ap);
|
||||
double alpha = 0.0;
|
||||
ddot(nrow, p, Ap, &alpha);
|
||||
alpha = rtrans / alpha;
|
||||
waxpby(nrow, 1.0, s.x, alpha, p, s.x);
|
||||
waxpby(nrow, 1.0, r, -alpha, Ap, r);
|
||||
|
||||
int k;
|
||||
timeStart = getTimeStamp();
|
||||
while (t <= te) {
|
||||
|
||||
// if (commIsMaster(s.comm)) writeResidual(fp, t, res);
|
||||
|
||||
t += 0.5;
|
||||
|
||||
#ifdef VERBOSE
|
||||
if (commIsMaster(s.comm)) {
|
||||
printf("TIME %f , TIMESTEP %f\n", t, d.dt);
|
||||
}
|
||||
#else
|
||||
printProgress(t);
|
||||
#endif
|
||||
for (k = 1; k < itermax && normr > eps; k++) {
|
||||
oldrtrans = rtrans;
|
||||
ddot(nrow, r, r, &rtrans);
|
||||
double beta = rtrans / oldrtrans;
|
||||
waxpby(nrow, 1.0, r, beta, p, p);
|
||||
// TICK(); exchange_externals(A,p); TOCK(t5);
|
||||
spMVM(&s.A, p, Ap);
|
||||
alpha = 0.0;
|
||||
ddot(nrow, p, Ap, &alpha);
|
||||
alpha = rtrans / alpha;
|
||||
waxpby(nrow, 1.0, s.x, alpha, p, s.x);
|
||||
waxpby(nrow, 1.0, r, -alpha, Ap, r);
|
||||
}
|
||||
timeStop = getTimeStamp();
|
||||
if (commIsMaster(s.comm)) {
|
||||
printf("Solution took %.2fs\n", timeStop - timeStart);
|
||||
|
||||
double residual = compute_residual(&s);
|
||||
|
||||
if (commIsMaster(&comm)) {
|
||||
printf("Solution performed %d iterations and took %.2fs\n",
|
||||
k,
|
||||
timeStop - timeStart);
|
||||
printf("Difference between computed and exact = %f\n", residual);
|
||||
}
|
||||
// if (commIsMaster(s.comm)) fclose(fp);
|
||||
commFinalize(s.comm);
|
||||
|
||||
commFinalize(&comm);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
@ -94,7 +94,7 @@ void matrixRead(Matrix* m, char* filename)
|
||||
Entry* mm;
|
||||
|
||||
if (sym_flag) {
|
||||
mm = (Entry*)allocate(64, nz * 2 * sizeof(Entry));
|
||||
mm = (Entry*)allocate(ARRAY_ALIGNMENT, nz * 2 * sizeof(Entry));
|
||||
} else {
|
||||
mm = (Entry*)allocate(64, nz * sizeof(Entry));
|
||||
}
|
||||
|
@ -4,6 +4,7 @@
|
||||
* license that can be found in the LICENSE file.*/
|
||||
#ifndef __MATRIX_H_
|
||||
#define __MATRIX_H_
|
||||
#include <stdbool.h>
|
||||
|
||||
#include "util.h"
|
||||
|
||||
@ -14,7 +15,9 @@ typedef struct {
|
||||
} Entry;
|
||||
|
||||
typedef struct {
|
||||
CG_UINT nr, nnz; // number of rows and non zeros
|
||||
CG_UINT nr, nc, nnz; // number of rows, columns and non zeros
|
||||
CG_UINT totalNr, totalNnz; // number of rows and non zeros
|
||||
CG_UINT startRow, stopRow;
|
||||
CG_UINT *colInd, *rowPtr; // colum Indices, row Pointer
|
||||
CG_FLOAT* val;
|
||||
} Matrix;
|
||||
|
@ -1,9 +1,7 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
/* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of nusif-solver.
|
||||
* Use of this source code is governed by a MIT style
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
* license that can be found in the LICENSE file. */
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
@ -14,16 +12,12 @@
|
||||
|
||||
void initParameter(Parameter* param)
|
||||
{
|
||||
param->xlength = 1.0;
|
||||
param->ylength = 1.0;
|
||||
param->imax = 100;
|
||||
param->jmax = 100;
|
||||
param->filename = "generate";
|
||||
param->nx = 100;
|
||||
param->ny = 100;
|
||||
param->nz = 1000;
|
||||
param->itermax = 1000;
|
||||
param->eps = 0.0001;
|
||||
param->omg = 1.8;
|
||||
param->levels = 5;
|
||||
param->presmooth = 5;
|
||||
param->postsmooth = 5;
|
||||
}
|
||||
|
||||
void readParameter(Parameter* param, const char* filename)
|
||||
@ -56,31 +50,11 @@ void readParameter(Parameter* param, const char* filename)
|
||||
#define PARSE_REAL(p) PARSE_PARAM(p, atof)
|
||||
|
||||
if (tok != NULL && val != NULL) {
|
||||
PARSE_REAL(xlength);
|
||||
PARSE_REAL(ylength);
|
||||
PARSE_INT(imax);
|
||||
PARSE_INT(jmax);
|
||||
PARSE_INT(nx);
|
||||
PARSE_INT(ny);
|
||||
PARSE_INT(nz);
|
||||
PARSE_INT(itermax);
|
||||
PARSE_REAL(eps);
|
||||
PARSE_REAL(omg);
|
||||
PARSE_REAL(re);
|
||||
PARSE_REAL(tau);
|
||||
PARSE_REAL(gamma);
|
||||
PARSE_REAL(dt);
|
||||
PARSE_REAL(te);
|
||||
PARSE_REAL(gx);
|
||||
PARSE_REAL(gy);
|
||||
PARSE_STRING(name);
|
||||
PARSE_INT(bcLeft);
|
||||
PARSE_INT(bcRight);
|
||||
PARSE_INT(bcBottom);
|
||||
PARSE_INT(bcTop);
|
||||
PARSE_INT(levels);
|
||||
PARSE_INT(presmooth);
|
||||
PARSE_INT(postsmooth);
|
||||
PARSE_REAL(u_init);
|
||||
PARSE_REAL(v_init);
|
||||
PARSE_REAL(p_init);
|
||||
}
|
||||
}
|
||||
|
||||
@ -89,26 +63,10 @@ void readParameter(Parameter* param, const char* filename)
|
||||
|
||||
void printParameter(Parameter* param)
|
||||
{
|
||||
printf("Parameters for %s\n", param->name);
|
||||
printf("Boundary conditions Left:%d Right:%d Bottom:%d Top:%d\n",
|
||||
param->bcLeft,
|
||||
param->bcRight,
|
||||
param->bcBottom,
|
||||
param->bcTop);
|
||||
printf("\tReynolds number: %.2f\n", param->re);
|
||||
printf("\tInit arrays: U:%.2f V:%.2f P:%.2f\n",
|
||||
param->u_init,
|
||||
param->v_init,
|
||||
param->p_init);
|
||||
printf("Parameters\n");
|
||||
printf("Geometry data:\n");
|
||||
printf("\tDomain box size (x, y): %.2f, %.2f\n", param->xlength, param->ylength);
|
||||
printf("\tCells (x, y): %d, %d\n", param->imax, param->jmax);
|
||||
printf("Timestep parameters:\n");
|
||||
printf("\tDefault stepsize: %.2f, Final time %.2f\n", param->dt, param->te);
|
||||
printf("\tTau factor: %.2f\n", param->tau);
|
||||
printf("\tPoints (x, y, z): %d, %d, %d\n", param->nx, param->ny, param->nz);
|
||||
printf("Iterative solver parameters:\n");
|
||||
printf("\tMax iterations: %d\n", param->itermax);
|
||||
printf("\tepsilon (stopping tolerance) : %f\n", param->eps);
|
||||
printf("\tgamma (stopping tolerance) : %f\n", param->gamma);
|
||||
printf("\tomega (SOR relaxation): %f\n", param->omg);
|
||||
}
|
||||
|
@ -1,24 +1,15 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
/* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of nusif-solver.
|
||||
* Use of this source code is governed by a MIT style
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
* license that can be found in the LICENSE file. */
|
||||
#ifndef __PARAMETER_H_
|
||||
#define __PARAMETER_H_
|
||||
|
||||
typedef struct {
|
||||
double xlength, ylength;
|
||||
int imax, jmax;
|
||||
char* filename;
|
||||
int nx, ny, nz;
|
||||
int itermax;
|
||||
double eps, omg;
|
||||
double re, tau, gamma;
|
||||
double te, dt;
|
||||
double gx, gy;
|
||||
char* name;
|
||||
int bcLeft, bcRight, bcBottom, bcTop;
|
||||
double u_init, v_init, p_init;
|
||||
int levels, presmooth, postsmooth;
|
||||
double eps;
|
||||
} Parameter;
|
||||
|
||||
void initParameter(Parameter*);
|
||||
|
@ -1,15 +1,12 @@
|
||||
/*
|
||||
* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
/* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
|
||||
* All rights reserved. This file is part of nusif-solver.
|
||||
* Use of this source code is governed by a MIT style
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
#include <math.h>
|
||||
#include <mpi.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
* license that can be found in the LICENSE file. */
|
||||
#include "progress.h"
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
static double _end;
|
||||
static int _current;
|
||||
|
179
src/solver.c
179
src/solver.c
@ -2,13 +2,147 @@
|
||||
* All rights reserved.
|
||||
* Use of this source code is governed by a MIT-style
|
||||
* license that can be found in the LICENSE file.*/
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "allocate.h"
|
||||
#include "comm.h"
|
||||
#include "matrix.h"
|
||||
#include "solver.h"
|
||||
#include "util.h"
|
||||
|
||||
void spMVM(Matrix* m, double* x, double* y)
|
||||
static void matrixGenerate(Parameter* p, Solver* s, bool use_7pt_stencil)
|
||||
{
|
||||
int size = 1; // Serial case (not using MPI)
|
||||
int rank = 0;
|
||||
|
||||
CG_UINT local_nrow = p->nx * p->ny * p->nz;
|
||||
CG_UINT local_nnz = 27 * local_nrow;
|
||||
|
||||
CG_UINT total_nrow = local_nrow * size;
|
||||
CG_UINT total_nnz = 27 * total_nrow;
|
||||
|
||||
int start_row = local_nrow * rank;
|
||||
int stop_row = start_row + local_nrow - 1;
|
||||
|
||||
if (use_7pt_stencil) {
|
||||
printf("Generate 7pt matrix with ");
|
||||
} else {
|
||||
printf("Generate 27pt matrix with ");
|
||||
}
|
||||
printf("%d total rows and %d nonzeros\n", (int)total_nrow, (int)local_nnz);
|
||||
|
||||
s->A.val = (CG_FLOAT*)allocate(64, local_nnz * sizeof(CG_FLOAT));
|
||||
s->A.colInd = (CG_UINT*)allocate(64, local_nnz * sizeof(CG_UINT));
|
||||
s->A.rowPtr = (CG_UINT*)allocate(64, (local_nrow + 1) * sizeof(CG_UINT));
|
||||
s->x = (CG_FLOAT*)allocate(64, local_nrow * sizeof(CG_FLOAT));
|
||||
s->b = (CG_FLOAT*)allocate(64, local_nrow * sizeof(CG_FLOAT));
|
||||
s->xexact = (CG_FLOAT*)allocate(64, local_nrow * sizeof(CG_FLOAT));
|
||||
|
||||
CG_FLOAT* curvalptr = s->A.val;
|
||||
CG_UINT* curindptr = s->A.colInd;
|
||||
CG_UINT* currowptr = s->A.rowPtr;
|
||||
CG_FLOAT* x = s->x;
|
||||
CG_FLOAT* b = s->b;
|
||||
CG_FLOAT* xexact = s->xexact;
|
||||
|
||||
CG_UINT nnzglobal = 0;
|
||||
int nx = p->nx, ny = p->ny, nz = p->nz;
|
||||
CG_UINT cursor = 0;
|
||||
|
||||
// for (int i = 0; i < local_nnz; i++) {
|
||||
// curvalptr[i] = 0.0;
|
||||
// printf("%d-%f, ", i, m->val[i]);
|
||||
// }
|
||||
// printf("\n");
|
||||
*currowptr++ = 0;
|
||||
|
||||
for (int iz = 0; iz < nz; iz++) {
|
||||
for (int iy = 0; iy < ny; iy++) {
|
||||
for (int ix = 0; ix < nx; ix++) {
|
||||
|
||||
int curlocalrow = iz * nx * ny + iy * nx + ix;
|
||||
int currow = start_row + iz * nx * ny + iy * nx + ix;
|
||||
int nnzrow = 0;
|
||||
|
||||
// (*A)->ptr_to_vals_in_row[curlocalrow] = curvalptr;
|
||||
// (*A)->ptr_to_inds_in_row[curlocalrow] = curindptr;
|
||||
|
||||
for (int sz = -1; sz <= 1; sz++) {
|
||||
for (int sy = -1; sy <= 1; sy++) {
|
||||
for (int sx = -1; sx <= 1; sx++) {
|
||||
|
||||
int curcol = currow + sz * nx * ny + sy * nx + sx;
|
||||
// Since we have a stack of nx by ny by nz domains
|
||||
// , stacking in the z direction, we check to see
|
||||
// if sx and sy are reaching outside of the domain,
|
||||
// while the check for the curcol being valid is
|
||||
// sufficient to check the z values
|
||||
if ((ix + sx >= 0) && (ix + sx < nx) && (iy + sy >= 0) &&
|
||||
(iy + sy < ny) && (curcol >= 0 && curcol < total_nrow)) {
|
||||
// This logic will skip over point that are not part of a
|
||||
// 7-pt stencil
|
||||
if (!use_7pt_stencil ||
|
||||
(sz * sz + sy * sy + sx * sx <= 1)) {
|
||||
if (curcol == currow) {
|
||||
*curvalptr++ = 27.0;
|
||||
} else {
|
||||
*curvalptr++ = -1.0;
|
||||
}
|
||||
*curindptr++ = curcol;
|
||||
nnzrow++;
|
||||
}
|
||||
}
|
||||
} // end sx loop
|
||||
} // end sy loop
|
||||
} // end sz loop
|
||||
|
||||
*currowptr = *(currowptr - 1) + nnzrow;
|
||||
// printf("%d:%d-%lld, ", currow, nnzrow, *currowptr);
|
||||
currowptr++;
|
||||
nnzglobal += nnzrow;
|
||||
x[curlocalrow] = 0.0;
|
||||
b[curlocalrow] = 27.0 - ((CG_FLOAT)(nnzrow - 1));
|
||||
xexact[curlocalrow] = 1.0;
|
||||
} // end ix loop
|
||||
} // end iy loop
|
||||
} // end iz loop
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("Process %d of %d has %d rows\n", rank, size, local_nrow);
|
||||
printf("Global rows %d through %d\n", start_row, stop_row);
|
||||
printf("%d nonzeros\n", start_row, stop_row);
|
||||
#endif /* ifdef VERBOSE */
|
||||
|
||||
// for (int i = 0; i < local_nnz; i++) {
|
||||
// printf("%d:%f, ", (int)m->colInd[i], m->val[i]);
|
||||
// }
|
||||
// printf("\n");
|
||||
|
||||
s->A.startRow = start_row;
|
||||
s->A.stopRow = stop_row;
|
||||
s->A.totalNr = total_nrow;
|
||||
s->A.totalNnz = total_nnz;
|
||||
s->A.nr = local_nrow;
|
||||
s->A.nc = local_nrow;
|
||||
s->A.nnz = local_nnz;
|
||||
}
|
||||
|
||||
void initSolver(Solver* s, Parameter* p)
|
||||
{
|
||||
if (!strcmp(p->filename, "generate")) {
|
||||
matrixGenerate(p, s, false);
|
||||
} else if (!strcmp(p->filename, "generate7P")) {
|
||||
matrixGenerate(p, s, true);
|
||||
} else {
|
||||
matrixRead(&s->A, p->filename);
|
||||
}
|
||||
}
|
||||
|
||||
void spMVM(Matrix* m, const CG_FLOAT* restrict x, CG_FLOAT* restrict y)
|
||||
{
|
||||
CG_UINT numRows = m->nr;
|
||||
CG_UINT* rowPtr = m->rowPtr;
|
||||
@ -26,3 +160,46 @@ void spMVM(Matrix* m, double* x, double* y)
|
||||
y[rowID] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
void waxpby(const CG_UINT n,
|
||||
const CG_FLOAT alpha,
|
||||
const CG_FLOAT* restrict x,
|
||||
const CG_FLOAT beta,
|
||||
const CG_FLOAT* restrict y,
|
||||
CG_FLOAT* const w)
|
||||
{
|
||||
if (alpha == 1.0) {
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
w[i] = x[i] + beta * y[i];
|
||||
}
|
||||
} else if (beta == 1.0) {
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
w[i] = alpha * x[i] + y[i];
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
w[i] = alpha * x[i] + beta * y[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ddot(const CG_UINT n,
|
||||
const CG_FLOAT* restrict x,
|
||||
const CG_FLOAT* restrict y,
|
||||
CG_FLOAT* restrict result)
|
||||
{
|
||||
CG_FLOAT sum = 0.0;
|
||||
|
||||
if (y == x) {
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
sum += x[i] * x[i];
|
||||
}
|
||||
} else {
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
sum += x[i] * y[i];
|
||||
}
|
||||
}
|
||||
|
||||
commReduction(&sum, SUM);
|
||||
*result = sum;
|
||||
}
|
||||
|
28
src/solver.h
28
src/solver.h
@ -4,21 +4,29 @@
|
||||
* license that can be found in the LICENSE file. */
|
||||
#ifndef __SOLVER_H_
|
||||
#define __SOLVER_H_
|
||||
#include "comm.h"
|
||||
#include "matrix.h"
|
||||
#include "parameter.h"
|
||||
#include "util.h"
|
||||
|
||||
typedef struct {
|
||||
/* geometry and grid information */
|
||||
/* parameters */
|
||||
double eps, omega;
|
||||
int itermax;
|
||||
|
||||
/* communication */
|
||||
Comm* comm;
|
||||
Matrix A;
|
||||
CG_FLOAT* x;
|
||||
CG_FLOAT* b;
|
||||
CG_FLOAT* xexact;
|
||||
} Solver;
|
||||
|
||||
void initSolver(Solver*, Parameter*);
|
||||
void spMVM(Matrix* m, CG_FLOAT* x, CG_FLOAT* y);
|
||||
void initSolver(Solver* s, Parameter*);
|
||||
void spMVM(Matrix* m, const CG_FLOAT* restrict x, CG_FLOAT* restrict y);
|
||||
|
||||
void waxpby(const CG_UINT n,
|
||||
const CG_FLOAT alpha,
|
||||
const CG_FLOAT* restrict x,
|
||||
const CG_FLOAT beta,
|
||||
const CG_FLOAT* restrict y,
|
||||
double* restrict w);
|
||||
|
||||
void ddot(const CG_UINT n,
|
||||
const CG_FLOAT* restrict e,
|
||||
const CG_FLOAT* restrict y,
|
||||
CG_FLOAT* restrict result);
|
||||
#endif // __SOLVER_H_
|
||||
|
Loading…
Reference in New Issue
Block a user