Fix bugs and clean up. Add subroutines.

This commit is contained in:
Jan Eitzinger 2025-01-20 18:24:37 +01:00
parent 6f1932cf4f
commit 49dc2d97ae
7 changed files with 396 additions and 345 deletions

View File

@@ -3,9 +3,9 @@
 #==============================================================================
 filename generate
-nx 50
-ny 50
-nz 50
+nx 10
+ny 10
+nz 10
 itermax 10000
 eps 0.0001

View File

@@ -17,5 +17,5 @@ VERSION = --version
 CFLAGS = -O3 -ffast-math -std=c99 $(OPENMP)
 #CFLAGS = -Ofast -fnt-store=aggressive -std=c99 $(OPENMP) #AMD CLANG
 LFLAGS = $(OPENMP)
-DEFINES += -D_GNU_SOURCE
+DEFINES += -D_GNU_SOURCE# -DVERBOSE
 INCLUDES = -I/Users/jan/.local/include

View File

@@ -3,6 +3,7 @@
  * Use of this source code is governed by a MIT style
  * license that can be found in the LICENSE file. */
 #include "util.h"
+#include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -19,15 +20,162 @@ static int sizeOfRank(int rank, int size, int N)
     return N / size + ((N % size > rank) ? 1 : 0);
 }
 
-void commReduction(double* v, int op)
-{
-#ifdef _MPI
-    if (op == MAX) {
-        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-    } else if (op == SUM) {
-        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
-    }
-#endif
-}
+// Ensure that all the neighbors we expect to receive from also send to us
+// sendList         - All ranks that receive values from us
+// numSendNeighbors - Number of entries in sendList
+// recvList         - All ranks that send values to us
+// numRecvNeighbors - Number of entries in recvList
+// FIXME: What if ranks want to send to us and are not in sendList?
+static void probeNeighbors(
+    int* sendList, int numSendNeighbors, int* recvList, int numRecvNeighbors)
+{
+    int val;
+
+    for (int i = 0; i < numSendNeighbors; i++) {
+        sendList[i] = -1;
+    }
+
+    int MPI_MY_TAG = 99;
+    MPI_Request request[MAX_NUM_MESSAGES];
+
+    for (int i = 0; i < numSendNeighbors; i++) {
+        MPI_Irecv(&val,
+            1,
+            MPI_INT,
+            MPI_ANY_SOURCE,
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+    }
+
+    for (int i = 0; i < numRecvNeighbors; i++) {
+        MPI_Send(&val, 1, MPI_INT, recvList[i], MPI_MY_TAG, MPI_COMM_WORLD);
+    }
+
+    // Receive message from each send neighbor to construct 'sendList'.
+    MPI_Status status;
+
+    for (int i = 0; i < numSendNeighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            printf("MPI_Wait error\n");
+            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+            exit(EXIT_FAILURE);
+        }
+        sendList[i] = status.MPI_SOURCE;
+    }
+}
+
+static void buildNeighbors(Comm* c, int* external_processor)
+{
+    int numNeighbors = c->numNeighbors;
+    int numExternal  = c->numExternal;
+    int* neighbors   = c->neighbors;
+    int lengths[numNeighbors];
+    int MPI_MY_TAG = 100;
+    MPI_Request request[MAX_NUM_MESSAGES];
+
+    // First post receives
+    for (int i = 0; i < numNeighbors; i++) {
+        MPI_Irecv(lengths + i,
+            1,
+            MPI_INT,
+            neighbors[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+    }
+
+    int* recvCount = c->recvCount;
+    int* sendCount = c->sendCount;
+    int j          = 0;
+
+    for (int i = 0; i < numNeighbors; i++) {
+        int count = 0;
+
+        // go through list of external elements until updating
+        // processor changes
+        while ((j < numExternal) && (external_processor[j] == neighbors[i])) {
+            count++;
+            j++;
+            if (j == numExternal) break;
+        }
+        recvCount[i] = count;
+        MPI_Send(&count, 1, MPI_INT, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
+    }
+
+    MPI_Status status;
+    // Complete the receives of the number of externals
+    for (int i = 0; i < numNeighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            printf("MPI_Wait error\n");
+            exit(EXIT_FAILURE);
+        }
+        sendCount[i] = lengths[i];
+    }
+}
+
+static void buildElementsToSend(
+    Comm* c, int startRow, int* external_processor, int* new_external)
+{
+    int numNeighbors = c->numNeighbors;
+    int numExternal  = c->numExternal;
+    int* neighbors   = c->neighbors;
+    int MPI_MY_TAG = 100;
+    MPI_Request request[MAX_NUM_MESSAGES];
+
+    c->elementsToSend = (int*)allocate(ARRAY_ALIGNMENT,
+        c->totalSendCount * sizeof(int));
+    int* elementsToSend = c->elementsToSend;
+
+    int j = 0;
+
+    for (int i = 0; i < numNeighbors; i++) {
+        MPI_Irecv(elementsToSend + j,
+            c->sendCount[i],
+            MPI_INT,
+            neighbors[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+        j += c->sendCount[i];
+    }
+
+    j = 0;
+
+    for (int i = 0; i < numNeighbors; i++) {
+        int start = j;
+
+        // Go through list of external elements
+        // until updating processor changes. This is redundant, but
+        // saves us from recording this information.
+        while ((j < numExternal) && (external_processor[j] == neighbors[i])) {
+            j++;
+            if (j == numExternal) break;
+        }
+        MPI_Send(new_external + start,
+            j - start,
+            MPI_INT,
+            neighbors[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD);
+    }
+
+    MPI_Status status;
+    // receive from each neighbor the global index list of external elements
+    for (int i = 0; i < numNeighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            printf("MPI_Wait error\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    /// replace global indices by local indices
+    for (int i = 0; i < c->totalSendCount; i++) {
+        elementsToSend[i] -= startRow;
+    }
+}
 
 void commPartition(Comm* c, Matrix* A)
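The handshake inside probeNeighbors is easiest to see in isolation: a rank pings every rank it receives data from, posts one wildcard receive per rank that receives data from it, and reads the sender of each completed receive from MPI_SOURCE. A minimal self-contained sketch of that pattern on a ring topology (the ring and the variable names are illustrative only, not part of the commit):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Ring: every rank receives halo data from its left neighbor.
    int recvFrom = (rank - 1 + size) % size; // known a priori
    int sendTo   = -1;                       // discovered by the handshake
    int val = 0, tag = 99;

    // One wildcard receive per rank that will ping us (here exactly one).
    MPI_Request req;
    MPI_Irecv(&val, 1, MPI_INT, MPI_ANY_SOURCE, tag, MPI_COMM_WORLD, &req);

    // Ping the rank we receive data from.
    MPI_Send(&val, 1, MPI_INT, recvFrom, tag, MPI_COMM_WORLD);

    // The source of the completed receive is the rank that receives from us.
    MPI_Status status;
    MPI_Wait(&req, &status);
    sendTo = status.MPI_SOURCE;

    printf("Rank %d: receives from %d, sends to %d\n", rank, recvFrom, sendTo);
    MPI_Finalize();
    return 0;
}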
@@ -35,17 +183,16 @@ void commPartition(Comm* c, Matrix* A)
 #ifdef _MPI
     int rank = c->rank;
     int size = c->size;
-    MPI_Comm comm = c->comm;
 
     // Extract Matrix pieces
-    int start_row = A->startRow;
-    int stop_row = A->stopRow;
-    int total_nrow = A->totalNr;
-    long long total_nnz = A->totalNnz;
-    int local_nrow = A->nr;
-    int local_nnz = A->nnz;
-    int* row_ptr = (int*)A->rowPtr;
-    int* col_ind = (int*)A->colInd;
+    CG_UINT start_row = A->startRow;
+    CG_UINT stop_row = A->stopRow;
+    CG_UINT total_nrow = A->totalNr;
+    CG_UINT total_nnz = A->totalNnz;
+    CG_UINT local_nrow = A->nr;
+    CG_UINT local_nnz = A->nnz;
+    CG_UINT* row_ptr = A->rowPtr;
+    CG_UINT* col_ind = A->colInd;
 
     // We need to convert the index values for the rows on this processor
     // to a local index space. We need to:
@@ -58,48 +205,54 @@ void commPartition(Comm* c, Matrix* A)
     // - find out which processor owns the value.
     // - Set up communication for sparse MV operation.
 
-    // Scan the indices and transform to local
-    int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
-    int num_external = 1;
+    // FIXME: Use a lookup table with size total number of rows. For lower memory
+    // consumption a map would be better choice.
+    int* externals = (int*)allocate(ARRAY_ALIGNMENT, total_nrow * sizeof(int));
+    int num_external = 0; // local number of external indices
+
+    // column indices that are not processed yet are marked with -1
+    for (int i = 0; i < total_nrow; i++) {
+        externals[i] = -1;
+    }
 
     int* external_index = (int*)allocate(ARRAY_ALIGNMENT,
         MAX_EXTERNAL * sizeof(int));
-    c->external_index = external_index;
-
-    for (int i = 0; i < A->totalNr; i++) {
-        externals[i] = -1;
-    }
 
     for (int i = 0; i < local_nrow; i++) {
         for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) {
             int cur_ind = A->colInd[j];
 
 #ifdef VERBOSE
-            printf("Process %d of %d getting index %d in local row %d\n",
+            printf("Rank %d of %d getting entry %d:index %d in local row %d\n",
                 rank,
                 size,
+                j,
                 cur_ind,
                 i);
 #endif
 
-            // shift local rows to the start
+            // convert local column references to local numbering
             if (start_row <= cur_ind && cur_ind <= stop_row) {
                 col_ind[j] -= start_row;
             } else {
-                // Must find out if we have already set up this point
+                // find out if we have already set up this point
                 if (externals[cur_ind] == -1) {
-                    externals[cur_ind] = num_external++;
+                    externals[cur_ind] = num_external;
 
                     if (num_external <= MAX_EXTERNAL) {
-                        external_index[num_external - 1] = cur_ind;
-                        // Mark index as external by negating it
-                        col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
+                        external_index[num_external] = cur_ind;
+                        // mark in local column index as external by negating it
+                        // col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
+                        col_ind[j] = -col_ind[j];
                     } else {
                         printf("Must increase MAX_EXTERNAL\n");
+                        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
                         exit(EXIT_FAILURE);
                     }
+                    num_external++;
                 } else {
-                    // Mark index as external by negating it
+                    // Mark index as external by adding 1 and negating it
+                    // col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
                     col_ind[j] = -col_ind[j];
                 }
             }
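The marking scheme above depends on the column index type being signed (consistent with CG_UINT being switched to int further down in this commit): an off-rank column keeps its global value but is stored negated, so a later pass can spot it by sign and rewrite it to a local position. A small stand-alone illustration with made-up numbers; it shares the caveat the FIXME comments point at, namely that a global index of 0 cannot be marked by plain negation:

#include <stdio.h>

int main(void)
{
    int startRow = 4, stopRow = 7;    // rows owned by this rank (made up)
    int colInd[] = { 4, 6, 2, 9, 7 }; // global column indices (made up)
    int n = sizeof(colInd) / sizeof(colInd[0]);

    for (int j = 0; j < n; j++) {
        if (startRow <= colInd[j] && colInd[j] <= stopRow) {
            colInd[j] -= startRow; // local column: shift into 0..nr-1
        } else {
            colInd[j] = -colInd[j]; // external: keep global value, flag by sign
        }
    }

    for (int j = 0; j < n; j++) {
        printf("%d%s ", colInd[j], colInd[j] < 0 ? "(ext)" : "");
    }
    printf("\n"); // prints: 0 2 -2(ext) -9(ext) 3
    return 0;
}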
@@ -109,44 +262,42 @@ void commPartition(Comm* c, Matrix* A)
     /**************************************************************************
     Go through list of externals to find out which processors must be accessed.
     **************************************************************************/
-    c->num_external = num_external;
-    int tmp_buffer[size];
-    int global_index_offsets[size];
-
-    for (int i = 0; i < size; i++) {
-        tmp_buffer[i] = 0;
-    }
-
-    tmp_buffer[rank] = start_row;
-
-    MPI_Allreduce(tmp_buffer,
-        global_index_offsets,
-        size,
-        MPI_INT,
-        MPI_SUM,
-        MPI_COMM_WORLD);
-
-    // Go through list of externals and find the processor that owns each
     int external_processor[num_external];
 
+    {
+        int globalIndexOffsets[size];
+        MPI_Allgather(&start_row,
+            1,
+            MPI_INT,
+            globalIndexOffsets,
+            1,
+            MPI_INT,
+            MPI_COMM_WORLD);
+
+        // for (int i = 0; i < size; i++) {
+        //     printf("Rank %d: i = %d: OFFSET %d\n", rank, i, globalIndexOffsets[i]);
+        // }
+
+        // Go through list of externals and find the processor that owns each
         for (int i = 0; i < num_external; i++) {
-            int cur_ind = external_index[i];
+            int globalIndex = external_index[i];
 
             for (int j = size - 1; j >= 0; j--) {
-                if (global_index_offsets[j] <= cur_ind) {
+                if (globalIndexOffsets[j] <= globalIndex) {
                     external_processor[i] = j;
                     break;
                 }
            }
        }
+    }
 
     /*Go through the external elements. For each newly encountered external
     point assign it the next index in the local sequence. Then look for other
-    external elements who are updated by the same node and assign them the next
+    external elements who are updated by the same rank and assign them the next
     set of index numbers in the local sequence (ie. elements updated by the same
-    node have consecutive indices).*/
+    rank have consecutive indices).*/
 
     int* external_local_index = (int*)allocate(ARRAY_ALIGNMENT,
-        MAX_EXTERNAL * sizeof(int));
-    c->external_local_index = external_local_index;
+        num_external * sizeof(int));
 
     int count = local_nrow;
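Ownership of an external index is resolved purely from the gathered start_row values: scanning ranks from highest to lowest, the first rank whose offset does not exceed the global index owns it. A tiny stand-alone version of that lookup; the offsets are made up here, while in the diff they come from the MPI_Allgather above:

#include <stdio.h>

// First row owned by each rank; rank r owns rows offsets[r] .. offsets[r+1]-1.
static int ownerOf(int globalIndex, const int* offsets, int size)
{
    for (int j = size - 1; j >= 0; j--) {
        if (offsets[j] <= globalIndex) {
            return j;
        }
    }
    return -1; // not reached for valid indices
}

int main(void)
{
    int offsets[] = { 0, 25, 50, 75 }; // 4 ranks, 25 rows each (made up)
    printf("row 10 -> rank %d\n", ownerOf(10, offsets, 4)); // 0
    printf("row 60 -> rank %d\n", ownerOf(60, offsets, 4)); // 2
    printf("row 75 -> rank %d\n", ownerOf(75, offsets, 4)); // 3
    return 0;
}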
@@ -172,27 +323,39 @@ void commPartition(Comm* c, Matrix* A)
     for (int i = 0; i < local_nrow; i++) {
         for (int j = rowPtr[i]; j < rowPtr[i + 1]; j++) {
             if (col_ind[j] < 0) {
-                int cur_ind = -col_ind[j] - 1; // FIXME: Offset by 1??
+                // size_t cur_ind = -(col_ind[j] - 1); // FIXME: Offset by 1??
+                int cur_ind = -col_ind[j];
                 col_ind[j] = external_local_index[externals[cur_ind]];
             }
         }
     }
 
-    int new_external_processor[num_external];
-
-    for (int i = 0; i < num_external; i++) {
-        new_external_processor[i] = 0;
-    }
-
-    // setup map from external id to partition
-    for (int i = 0; i < num_external; i++) {
-        new_external_processor[external_local_index[i] - local_nrow] =
-            external_processor[i];
-    }
+    free(externals);
+    // int new_external_processor[num_external];
+    //
+    // for (int i = 0; i < num_external; i++) {
+    //     new_external_processor[i] = -1;
+    // }
+    //
+    // // setup map from external id to partition
+    // for (int i = 0; i < num_external; i++) {
+    //     int id = external_local_index[i] - local_nrow;
+    //     new_external_processor[id] = external_processor[i];
+    //     printf("Rank %d of %d: %d new_external_processor[%d] = %d\n",
+    //         rank,
+    //         size,
+    //         i,
+    //         id,
+    //         external_processor[i]);
+    // }
+    // commFinalize(c);
+    // exit(EXIT_SUCCESS);
+    //
 
 #ifdef VERBOSE
+    printf("Rank %d of %d: %d externals\n", rank, size, num_external);
+
     for (int i = 0; i < num_external; i++) {
-        printf("Process %d of %d: external process[%d] = %d\n",
+        printf("Rank %d of %d: external[%d] owned by %d\n",
             rank,
             size,
             i,
@@ -217,119 +380,93 @@ void commPartition(Comm* c, Matrix* A)
     int num_recv_neighbors = 0;
     int length = 1;
 
+    // Encoding both number of ranks that need values from this rank and the total
+    // number of values by adding one for any additional rank and adding size for
+    // every additional value.
     for (int i = 0; i < num_external; i++) {
-        if (tmp_neighbors[new_external_processor[i]] == 0) {
+        if (tmp_neighbors[external_processor[i]] == 0) {
             num_recv_neighbors++;
-            tmp_neighbors[new_external_processor[i]] = 1;
+            tmp_neighbors[external_processor[i]] = 1;
         }
-        tmp_neighbors[new_external_processor[i]] += size;
+        tmp_neighbors[external_processor[i]] += size;
     }
 
     // sum over all processors all the tmp_neighbors arrays
-    MPI_Allreduce(tmp_neighbors,
-        tmp_buffer,
+    MPI_Allreduce(MPI_IN_PLACE,
+        tmp_neighbors,
         size,
         MPI_INT,
         MPI_SUM,
         MPI_COMM_WORLD);
 
-    /* decode the combined 'tmp_neighbors' (stored in tmp_buffer) array from all
-     * the processors */
-    int num_send_neighbors = tmp_buffer[rank] % size;
+    /* decode the combined 'tmp_neighbors' array from all ranks */
+    // Number of ranks that receive values from us
+    int num_send_neighbors = tmp_neighbors[rank] % size;
 
-    /* decode 'tmp_buffer[rank] to deduce total number of elements we must send */
-    int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;
+    /* decode 'tmp_neighbors[rank] to deduce total number of elements we must send
+     */
+    c->totalSendCount = (tmp_neighbors[rank] - num_send_neighbors) / size;
 
-    /* Check to see if we have enough workspace allocated. This could be
+    /* Check to see if we have enough memory allocated. This could be
     dynamically modified, but let's keep it simple for now...*/
     if (num_send_neighbors > MAX_NUM_MESSAGES) {
         printf("Must increase MAX_NUM_MESSAGES. Must be at least %d\n",
             num_send_neighbors);
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
         exit(EXIT_FAILURE);
     }
 
-    if (total_to_be_sent > MAX_EXTERNAL) {
+    if (c->totalSendCount > MAX_EXTERNAL) {
         printf("Must increase MAX_EXTERNAL. Must be at least %d\n",
-            total_to_be_sent);
+            c->totalSendCount);
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
         exit(EXIT_FAILURE);
     }
 
 #ifdef VERBOSE
-    cout << "Processor " << rank << " of " << size
-         << ": Number of send neighbors = " << num_send_neighbors << endl;
-
-    cout << "Processor " << rank << " of " << size
-         << ": Number of receive neighbors = " << num_recv_neighbors << endl;
-
-    cout << "Processor " << rank << " of " << size
-         << ": Total number of elements to send = " << total_to_be_sent << endl;
+    printf("Rank %d of %d: tmp_neighbors = %d\n",
+        rank,
+        size,
+        tmp_neighbors[rank]);
+    printf("Rank %d of %d: Number of send neighbors = %d\n",
+        rank,
+        size,
+        num_send_neighbors);
+    printf("Rank %d of %d: Number of receive neighbors = %d\n",
+        rank,
+        size,
+        num_recv_neighbors);
+    printf("Rank %d of %d: Total number of elements to send = %d\n",
+        rank,
+        size,
+        c->totalSendCount);
 
     MPI_Barrier(MPI_COMM_WORLD);
 #endif
 
     /* Make a list of the neighbors that will send information to update our
     external elements (in the order that we will receive this information).*/
-    int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
+    int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_NUM_MESSAGES * sizeof(int));
 
+    // FIXME: Create local scope
     int j = 0;
-    recv_list[j++] = new_external_processor[0];
+    recv_list[j++] = external_processor[0];
 
     for (int i = 1; i < num_external; i++) {
-        if (new_external_processor[i - 1] != new_external_processor[i]) {
-            recv_list[j++] = new_external_processor[i];
+        if (external_processor[i - 1] != external_processor[i]) {
+            recv_list[j++] = external_processor[i];
         }
     }
 
     // Ensure that all the neighbors we expect to receive from also send to us
-    // Send a 0 length message to each of our recv neighbors
     int send_list[num_send_neighbors];
-
-    for (int i = 0; i < num_send_neighbors; i++) {
-        send_list[i] = 0;
-    }
-
-    // first post receives, these are immediate receives
-    // Do not wait for result to come, will do that at the
-    // wait call below.
-    int MPI_MY_TAG = 99;
-    MPI_Request request[MAX_NUM_MESSAGES];
-
-    for (int i = 0; i < num_send_neighbors; i++) {
-        MPI_Irecv(tmp_buffer + i,
-            1,
-            MPI_INT,
-            MPI_ANY_SOURCE,
-            MPI_MY_TAG,
-            MPI_COMM_WORLD,
-            request + i);
-    }
-
-    // send messages
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        MPI_Send(tmp_buffer + i,
-            1,
-            MPI_INT,
-            recv_list[i],
-            MPI_MY_TAG,
-            MPI_COMM_WORLD);
-    }
-
-    // Receive message from each send neighbor to construct 'send_list'.
-    MPI_Status status;
-
-    for (int i = 0; i < num_send_neighbors; i++) {
-        if (MPI_Wait(request + i, &status)) {
-            printf("MPI_Wait error\n");
-            exit(EXIT_FAILURE);
-        }
-        send_list[i] = status.MPI_SOURCE;
-    }
+    probeNeighbors(send_list, num_send_neighbors, recv_list, num_recv_neighbors);
 
     /* Compare the two lists. In most cases they should be the same.
     // However, if they are not then add new entries to the recv list
     // that are in the send list (but not already in the recv list).
-    WHY!! This ensures that the sendlist is equal to the sendlist
+    FIXME: WHY!! This ensures that the recv_list is equal to the sendlist
     But why is this required? -> Just One neighbour list??*/
     for (int j = 0; j < num_send_neighbors; j++) {
         int found = 0;
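The encoding decoded in the hunk above packs two quantities into one integer slot per rank so that a single Allreduce suffices: every rank that needs values from rank p adds 1 to slot p once, plus size for each requested value, so after the global sum slot rank holds count + size * values, which % size and / size pull apart again. A small worked example with made-up numbers:

#include <stdio.h>

int main(void)
{
    int size = 4; // number of MPI ranks (made up)

    // Contributions to our slot from two other ranks: one needs 3 of our
    // values, the other needs 5 (made-up numbers).
    int slot = (1 + 3 * size) + (1 + 5 * size); // what the Allreduce sum yields

    int numSendNeighbors = slot % size;                       // 2 ranks
    int totalSendCount   = (slot - numSendNeighbors) / size;  // 8 values

    printf("send neighbors = %d, values to send = %d\n",
        numSendNeighbors,
        totalSendCount);
    return 0;
}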
@@ -343,31 +480,25 @@ void commPartition(Comm* c, Matrix* A)
                 rank,
                 size,
                 num_recv_neighbors,
-                send_list[i]);
+                send_list[j]);
 #endif
             recv_list[num_recv_neighbors] = send_list[j];
-            (num_recv_neighbors)++;
+            num_recv_neighbors++;
         }
     }
 
-    num_send_neighbors = num_recv_neighbors;
-
-    if (num_send_neighbors > MAX_NUM_MESSAGES) {
+    // From here on only have one neighbor list for both send and recv
+    c->numNeighbors = num_recv_neighbors;
+
+    for (int i = 0; i < c->numNeighbors; i++) {
+        c->neighbors[i] = recv_list[i];
+    }
+
+    if (c->numNeighbors > MAX_NUM_MESSAGES) {
         printf("Must increase MAX_EXTERNAL\n");
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
         exit(EXIT_FAILURE);
     }
 
-    // Start filling communication setup
-    // Create 'new_external' which explicitly put the external elements in the
-    // order given by 'external_local_index'
-    c->total_to_be_sent = total_to_be_sent;
-    int* elements_to_send = (int*)allocate(ARRAY_ALIGNMENT,
-        total_to_be_sent * sizeof(int));
-    c->elements_to_send = elements_to_send;
-
-    for (int i = 0; i < total_to_be_sent; i++) {
-        elements_to_send[i] = 0;
-    }
-
     // Create 'new_external' which explicitly put the external elements in the
     // order given by 'external_local_index'
     int* new_external = (int*)allocate(ARRAY_ALIGNMENT,
@@ -377,195 +508,100 @@ void commPartition(Comm* c, Matrix* A)
         new_external[external_local_index[i] - local_nrow] = external_index[i];
     }
 
+    free(external_local_index);
+    free(external_index);
+
+    c->numExternal = num_external;
+    buildNeighbors(c, external_processor);
+
     // Send each processor the global index list of the external elements in the
     // order that I will want to receive them when updating my external elements
-    int lengths[num_recv_neighbors];
-    MPI_MY_TAG++;
-
-    // First post receives
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        int partner = recv_list[i];
-        MPI_Irecv(lengths + i,
-            1,
-            MPI_INT,
-            partner,
-            MPI_MY_TAG,
-            MPI_COMM_WORLD,
-            request + i);
-    }
-
-    int* neighbors = c->neighbors;
-    int* recv_length = c->recv_length;
-    int* send_length = c->send_length;
-
-    j = 0;
-
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        int start = j;
-        int newlength = 0;
-
-        // go through list of external elements until updating
-        // processor changes
-        while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
-            newlength++;
-            j++;
-            if (j == num_external) break;
-        }
-
-        recv_length[i] = newlength;
-        neighbors[i] = recv_list[i];
-        length = j - start;
-        MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
-    }
-
-    // Complete the receives of the number of externals
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        if (MPI_Wait(request + i, &status)) {
-            printf("MPI_Wait error\n");
-            exit(EXIT_FAILURE);
-        }
-        send_length[i] = lengths[i];
-    }
-
-    // Build "elements_to_send" list. These are the x elements I own
-    // that need to be sent to other processors.
-    MPI_MY_TAG++;
-
-    j = 0;
-
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        MPI_Irecv(elements_to_send + j,
-            send_length[i],
-            MPI_INT,
-            neighbors[i],
-            MPI_MY_TAG,
-            MPI_COMM_WORLD,
-            request + i);
-        j += send_length[i];
-    }
-
-    j = 0;
-
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        int start = j;
-        int newlength = 0;
-
-        // Go through list of external elements
-        // until updating processor changes. This is redundant, but
-        // saves us from recording this information.
-        while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
-            newlength++;
-            j++;
-            if (j == num_external) break;
-        }
-
-        MPI_Send(new_external + start,
-            j - start,
-            MPI_INT,
-            recv_list[i],
-            MPI_MY_TAG,
-            MPI_COMM_WORLD);
-    }
-
-    // receive from each neighbor the global index list of external elements
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        if (MPI_Wait(request + i, &status)) {
-            printf("MPI_Wait error\n");
-            exit(EXIT_FAILURE);
-        }
-    }
-
-    /// replace global indices by local indices
-    for (int i = 0; i < total_to_be_sent; i++) {
-        elements_to_send[i] -= start_row;
-    }
-
-    // Finish up !!
-    c->num_send_neighbors = num_send_neighbors;
+    // Build "elementsToSend" list. These are the x elements the current rank
+    // owns that need to be sent to other ranks
+    buildElementsToSend(c, A->startRow, external_processor, new_external);
+
     A->nc = A->nc + num_external;
-
-    // Used in exchange
-    CG_FLOAT* send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
-        total_to_be_sent * sizeof(CG_FLOAT));
-    c->send_buffer = send_buffer;
+    c->send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
+        c->totalSendCount * sizeof(CG_FLOAT));
 
     free(recv_list);
     free(new_external);
 #endif
 }
 
-void commExchange(Comm* c, Matrix* A, double* x)
+void commExchange(Comm* c, Matrix* A, CG_FLOAT* x)
 {
 #ifdef _MPI
-    int num_external = 0;
-
-    // Extract Matrix pieces
-    int local_nrow = A->nr;
-    int num_neighbors = c->num_send_neighbors;
-    int* recv_length = c->recv_length;
-    int* send_length = c->send_length;
+    int numNeighbors = c->numNeighbors;
     int* neighbors = c->neighbors;
-    double* send_buffer = c->send_buffer;
-    int total_to_be_sent = c->total_to_be_sent;
-    int* elements_to_send = c->elements_to_send;
-
-    int rank = c->rank;
-    int size = c->size;
-    MPI_Comm comm = c->comm;
-
-    // first post receives, these are immediate receives
-    // Do not wait for result to come, will do that at the
-    // wait call below.
+    int* recvCount = c->recvCount;
+    int* sendCount = c->sendCount;
+    CG_FLOAT* sendBuffer = c->send_buffer;
+    int* elementsToSend = c->elementsToSend;
+
     int MPI_MY_TAG = 99;
-    MPI_Request request[num_neighbors];
+    MPI_Request request[numNeighbors];
 
     // Externals are at end of locals
-    double* x_external = (double*)x + local_nrow;
+    CG_FLOAT* externals = x + A->nr;
 
-    // Post receives first
-    for (int i = 0; i < num_neighbors; i++) {
-        int n_recv = recv_length[i];
-        MPI_Irecv(x_external,
-            n_recv,
-            MPI_DOUBLE,
+    // Post receives
+    for (int i = 0; i < numNeighbors; i++) {
+        int count = recvCount[i];
+
+        MPI_Irecv(externals,
+            count,
+            MPI_FLOAT_TYPE,
             neighbors[i],
             MPI_MY_TAG,
             MPI_COMM_WORLD,
             request + i);
-        x_external += n_recv;
+
+        externals += count;
     }
 
-    // Fill up send buffer
-    for (int i = 0; i < total_to_be_sent; i++) {
-        send_buffer[i] = x[elements_to_send[i]];
+    // Copy values for all ranks into send buffer
+    for (int i = 0; i < c->totalSendCount; i++) {
+        sendBuffer[i] = x[elementsToSend[i]];
     }
 
     // Send to each neighbor
-    for (int i = 0; i < num_neighbors; i++) {
-        int n_send = send_length[i];
-        MPI_Send(send_buffer,
-            n_send,
-            MPI_DOUBLE,
+    for (int i = 0; i < numNeighbors; i++) {
+        int count = sendCount[i];
+
+        MPI_Send(sendBuffer,
+            count,
+            MPI_FLOAT_TYPE,
             neighbors[i],
             MPI_MY_TAG,
             MPI_COMM_WORLD);
-        send_buffer += n_send;
+
+        sendBuffer += count;
     }
 
-    // Complete the reads issued above
+    // Complete the receives issued above
     MPI_Status status;
 
-    for (int i = 0; i < num_neighbors; i++) {
+    for (int i = 0; i < numNeighbors; i++) {
         if (MPI_Wait(request + i, &status)) {
             printf("MPI_Wait error\n");
+            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
             exit(EXIT_FAILURE);
         }
     }
 #endif
 }
 
+void commReduction(double* v, int op)
+{
+#ifdef _MPI
+    if (op == MAX) {
+        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+    } else if (op == SUM) {
+        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    }
+#endif
+}
+
 void commPrintConfig(Comm* c)
 {
 #ifdef _MPI
@@ -577,6 +613,21 @@ void commPrintConfig(Comm* c)
     for (int i = 0; i < c->size; i++) {
         if (i == c->rank) {
+            printf("Rank %d has %d neighbors with %d externals:\n",
+                c->rank,
+                c->numNeighbors,
+                c->numExternal);
+            for (int j = 0; j < c->numNeighbors; j++) {
+                printf("\t%d: receive %d send %d\n",
+                    c->neighbors[j],
+                    c->recvCount[j],
+                    c->sendCount[j]);
+            }
+            printf("\tSend %d elements: [", c->totalSendCount);
+            for (int j = 0; j < c->totalSendCount; j++) {
+                printf("%d, ", c->elementsToSend[j]);
+            }
+            printf("]\n");
             fflush(stdout);
         }
         MPI_Barrier(MPI_COMM_WORLD);
@@ -594,14 +645,14 @@ void commMatrixDump(Comm* c, Matrix* m)
     CG_FLOAT* val = m->val;
 
     if (commIsMaster(c)) {
-        printf("Matrix: %lld total non zeroes, total number of rows %lld\n",
+        printf("Matrix: %d total non zeroes, total number of rows %d\n",
             m->totalNnz,
             m->totalNr);
     }
 
     for (int i = 0; i < size; i++) {
         if (i == rank) {
-            printf("Rank %d: %lld non zeroes, number of rows %lld\n",
+            printf("Rank %d: %d non zeroes, number of rows %d\n",
                 rank,
                 m->nnz,
                 numRows);
@@ -609,9 +660,9 @@ void commMatrixDump(Comm* c, Matrix* m)
             for (int rowID = 0; rowID < numRows; rowID++) {
                 printf("Row [%d]: ", rowID);
 
-                for (size_t rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
+                for (int rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
                      rowEntry++) {
-                    printf("[%lld]:%.2f ", colInd[rowEntry], val[rowEntry]);
+                    printf("[%d]:%.2f ", colInd[rowEntry], val[rowEntry]);
                 }
                 printf("\n");

View File

@@ -4,6 +4,7 @@
  * license that can be found in the LICENSE file. */
 #ifndef __COMM_H_
 #define __COMM_H_
+#include "util.h"
 #if defined(_MPI)
 #include <mpi.h>
 #endif
@@ -20,18 +21,14 @@ typedef struct {
     int rank;
     int size;
 #if defined(_MPI)
-    MPI_Comm comm;
-
-    int num_external;
-    int num_send_neighbors;
-    int* external_index;
-    int* external_local_index;
-    int total_to_be_sent;
-    int* elements_to_send;
+    int numNeighbors;
+    int numExternal;
+    int totalSendCount;
+    int* elementsToSend;
     int neighbors[MAX_NUM_NEIGHBOURS];
-    int recv_length[MAX_NUM_NEIGHBOURS];
-    int send_length[MAX_NUM_NEIGHBOURS];
-    double* send_buffer;
+    int recvCount[MAX_NUM_NEIGHBOURS];
+    int sendCount[MAX_NUM_NEIGHBOURS];
+    CG_FLOAT* send_buffer;
 #endif
 } Comm;

View File

@@ -55,9 +55,9 @@ int main(int argc, char** argv)
     CG_FLOAT eps = (CG_FLOAT)param.eps;
     int itermax = param.itermax;
 
     initSolver(&s, &comm, &param);
-    commMatrixDump(&comm, &s.A);
-    commFinalize(&comm);
-    exit(EXIT_SUCCESS);
+    // commMatrixDump(&comm, &s.A);
+    commPartition(&comm, &s.A);
+    commPrintConfig(&comm);
 
     CG_UINT nrow = s.A.nr;
     CG_UINT ncol = s.A.nc;

View File

@@ -109,7 +109,7 @@ static void matrixGenerate(
 #ifdef VERBOSE
     printf("Process %d of %d has %d rows\n", rank, size, local_nrow);
     printf("Global rows %d through %d\n", start_row, stop_row);
-    printf("%d nonzeros\n", start_row, stop_row);
+    printf("%d nonzeros\n", local_nnz);
 #endif /* ifdef VERBOSE */
 
     s->A.startRow = start_row;
@@ -139,11 +139,11 @@ void spMVM(Matrix* m, const CG_FLOAT* restrict x, CG_FLOAT* restrict y)
     CG_UINT* colInd = m->colInd;
     CG_FLOAT* val = m->val;
 
-    for (size_t rowID = 0; rowID < numRows; rowID++) {
+    for (int rowID = 0; rowID < numRows; rowID++) {
         CG_FLOAT tmp = y[rowID];
 
         // loop over all elements in row
-        for (size_t rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
+        for (int rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
              rowEntry++) {
             tmp += val[rowEntry] * x[colInd[rowEntry]];
         }
@@ -160,15 +160,15 @@
     CG_FLOAT* const w)
 {
     if (alpha == 1.0) {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             w[i] = x[i] + beta * y[i];
         }
     } else if (beta == 1.0) {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             w[i] = alpha * x[i] + y[i];
         }
     } else {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             w[i] = alpha * x[i] + beta * y[i];
         }
     }
@@ -182,11 +182,11 @@ void ddot(const CG_UINT n,
     CG_FLOAT sum = 0.0;
 
     if (y == x) {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             sum += x[i] * x[i];
         }
     } else {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             sum += x[i] * y[i];
         }
     }

View File

@@ -23,13 +23,16 @@
 #define MAXLINE 4096
 #endif
 
-#define CG_UINT unsigned long long int
+// #define CG_UINT unsigned long long int
+#define CG_UINT int
 
 #if PRECISION == 1
 #define CG_FLOAT float
+#define MPI_FLOAT_TYPE MPI_FLOAT
 #define PRECISION_STRING "single"
 #else
 #define CG_FLOAT double
+#define MPI_FLOAT_TYPE MPI_DOUBLE
 #define PRECISION_STRING "double"
 #endif
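
The new MPI_FLOAT_TYPE macro keeps the MPI datatype in step with CG_FLOAT, so calls such as the ones in commExchange stay correct when PRECISION flips between single and double. A minimal sketch of the pairing, assuming only the macros from this hunk:

#include <mpi.h>
#include <stdio.h>

#if PRECISION == 1
#define CG_FLOAT       float
#define MPI_FLOAT_TYPE MPI_FLOAT
#else
#define CG_FLOAT       double
#define MPI_FLOAT_TYPE MPI_DOUBLE
#endif

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);

    // Each rank contributes 1.0; the matching MPI type is picked by the macro.
    CG_FLOAT local = 1.0;
    MPI_Allreduce(MPI_IN_PLACE, &local, 1, MPI_FLOAT_TYPE, MPI_SUM, MPI_COMM_WORLD);

    printf("global sum = %f\n", (double)local);
    MPI_Finalize();
    return 0;
}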