Fix bugs and clean up. Add subroutines.
parent 6f1932cf4f
commit 49dc2d97ae
6 hpcg.par
@@ -3,9 +3,9 @@
 #==============================================================================

 filename generate
-nx 50
-ny 50
-nz 50
+nx 10
+ny 10
+nz 10

 itermax 10000
 eps 0.0001
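For orientation, a minimal sketch of what the new values imply, assuming (as the local_nrow handling in matrixGenerate suggests) that nx, ny and nz describe the per-rank grid and every grid point contributes one matrix row; the helper below is hypothetical and not part of the repository:

#include <stdio.h>

int main(void)
{
    int nx = 10, ny = 10, nz = 10; // new values from hpcg.par above
    int localRows = nx * ny * nz;  // one row per grid point (assumption)
    printf("rows per rank: %d\n", localRows); // prints 1000
    return 0;
}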
@@ -17,5 +17,5 @@ VERSION = --version
 CFLAGS = -O3 -ffast-math -std=c99 $(OPENMP)
 #CFLAGS = -Ofast -fnt-store=aggressive -std=c99 $(OPENMP) #AMD CLANG
 LFLAGS = $(OPENMP)
-DEFINES += -D_GNU_SOURCE
+DEFINES += -D_GNU_SOURCE# -DVERBOSE
 INCLUDES = -I/Users/jan/.local/include
667 src/comm.c
@@ -3,6 +3,7 @@
* Use of this source code is governed by a MIT style
* license that can be found in the LICENSE file. */
#include "util.h"
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
@@ -19,15 +20,162 @@ static int sizeOfRank(int rank, int size, int N)
    return N / size + ((N % size > rank) ? 1 : 0);
}

void commReduction(double* v, int op)
// Ensure that all the neighbors we expect to receive from also send to us
// sendList - All ranks that receive values from us
// numSendNeighbors - Number of entries in sendList
// recvList - All ranks that send values to us
// numRecvNeighbors - Number of entries in recvList
// FIXME: What if ranks want to send to us and are not in sendList?
static void probeNeighbors(
    int* sendList, int numSendNeighbors, int* recvList, int numRecvNeighbors)
{
#ifdef _MPI
    if (op == MAX) {
        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    } else if (op == SUM) {
        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

    int val;

    for (int i = 0; i < numSendNeighbors; i++) {
        sendList[i] = -1;
    }

    int MPI_MY_TAG = 99;
    MPI_Request request[MAX_NUM_MESSAGES];

    for (int i = 0; i < numSendNeighbors; i++) {
        MPI_Irecv(&val,
            1,
            MPI_INT,
            MPI_ANY_SOURCE,
            MPI_MY_TAG,
            MPI_COMM_WORLD,
            request + i);
    }

    for (int i = 0; i < numRecvNeighbors; i++) {
        MPI_Send(&val, 1, MPI_INT, recvList[i], MPI_MY_TAG, MPI_COMM_WORLD);
    }

    // Receive message from each send neighbor to construct 'sendList'.
    MPI_Status status;
    for (int i = 0; i < numSendNeighbors; i++) {
        if (MPI_Wait(request + i, &status)) {
            printf("MPI_Wait error\n");
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
            exit(EXIT_FAILURE);
        }
        sendList[i] = status.MPI_SOURCE;
    }
}
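// Usage sketch (mirrors the probeNeighbors(...) call further down in
// commPartition; the numbers here are made up): every rank tells its receive
// neighbors "I will receive from you" with a one-int probe and collects the
// sources of the probes addressed to itself, which yields the send side.
//
//   int recvList[2] = { 1, 3 }; // ranks we receive externals from
//   int sendList[2];            // filled with ranks that expect data from us
//   probeNeighbors(sendList, 2, recvList, 2);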
|
||||
|
||||
static void buildNeighbors(Comm* c, int* external_processor)
|
||||
{
|
||||
int numNeighbors = c->numNeighbors;
|
||||
int numExternal = c->numExternal;
|
||||
int* neighbors = c->neighbors;
|
||||
int lengths[numNeighbors];
|
||||
int MPI_MY_TAG = 100;
|
||||
MPI_Request request[MAX_NUM_MESSAGES];
|
||||
|
||||
// First post receives
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
MPI_Irecv(lengths + i,
|
||||
1,
|
||||
MPI_INT,
|
||||
neighbors[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD,
|
||||
request + i);
|
||||
}
|
||||
|
||||
int* recvCount = c->recvCount;
|
||||
int* sendCount = c->sendCount;
|
||||
|
||||
int j = 0;
|
||||
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
int count = 0;
|
||||
|
||||
// go through list of external elements until updating
|
||||
// processor changes
|
||||
while ((j < numExternal) && (external_processor[j] == neighbors[i])) {
|
||||
count++;
|
||||
j++;
|
||||
if (j == numExternal) break;
|
||||
}
|
||||
|
||||
recvCount[i] = count;
|
||||
MPI_Send(&count, 1, MPI_INT, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
MPI_Status status;
|
||||
// Complete the receives of the number of externals
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
if (MPI_Wait(request + i, &status)) {
|
||||
printf("MPI_Wait error\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
sendCount[i] = lengths[i];
|
||||
}
|
||||
}
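// Worked example for buildNeighbors (hypothetical numbers): with
// neighbors = {1, 3} and the external list grouped by owning rank, 4 externals
// owned by rank 1 and 2 owned by rank 3 give recvCount = {4, 2}. Each count is
// sent to the owning rank, and the matching receives from those ranks fill
// sendCount, i.e. how many of our own rows each neighbor expects from us.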
|
||||
|
||||
static void buildElementsToSend(
|
||||
Comm* c, int startRow, int* external_processor, int* new_external)
|
||||
{
|
||||
int numNeighbors = c->numNeighbors;
|
||||
int numExternal = c->numExternal;
|
||||
int* neighbors = c->neighbors;
|
||||
int MPI_MY_TAG = 100;
|
||||
MPI_Request request[MAX_NUM_MESSAGES];
|
||||
c->elementsToSend = (int*)allocate(ARRAY_ALIGNMENT,
|
||||
c->totalSendCount * sizeof(int));
|
||||
int* elementsToSend = c->elementsToSend;
|
||||
|
||||
int j = 0;
|
||||
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
MPI_Irecv(elementsToSend + j,
|
||||
c->sendCount[i],
|
||||
MPI_INT,
|
||||
neighbors[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD,
|
||||
request + i);
|
||||
|
||||
j += c->sendCount[i];
|
||||
}
|
||||
|
||||
j = 0;
|
||||
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
int start = j;
|
||||
|
||||
// Go through list of external elements
|
||||
// until updating processor changes. This is redundant, but
|
||||
// saves us from recording this information.
|
||||
while ((j < numExternal) && (external_processor[j] == neighbors[i])) {
|
||||
j++;
|
||||
if (j == numExternal) break;
|
||||
}
|
||||
|
||||
MPI_Send(new_external + start,
|
||||
j - start,
|
||||
MPI_INT,
|
||||
neighbors[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
MPI_Status status;
|
||||
// receive from each neighbor the global index list of external elements
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
if (MPI_Wait(request + i, &status)) {
|
||||
printf("MPI_Wait error\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
/// replace global indices by local indices
|
||||
for (int i = 0; i < c->totalSendCount; i++) {
|
||||
elementsToSend[i] -= startRow;
|
||||
}
|
||||
#endif
|
||||
}
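// Taken together, these helpers factor the communication setup out of
// commPartition: probeNeighbors derives the send-side neighbor list from the
// receive side, buildNeighbors exchanges the per-neighbor element counts
// (recvCount/sendCount), and buildElementsToSend records, per neighbor, which
// local rows have to be packed into the send buffer during commExchange.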
|
||||
|
||||
void commPartition(Comm* c, Matrix* A)
|
||||
@ -35,17 +183,16 @@ void commPartition(Comm* c, Matrix* A)
|
||||
#ifdef _MPI
|
||||
int rank = c->rank;
|
||||
int size = c->size;
|
||||
MPI_Comm comm = c->comm;
|
||||
|
||||
// Extract Matrix pieces
|
||||
int start_row = A->startRow;
|
||||
int stop_row = A->stopRow;
|
||||
int total_nrow = A->totalNr;
|
||||
long long total_nnz = A->totalNnz;
|
||||
int local_nrow = A->nr;
|
||||
int local_nnz = A->nnz;
|
||||
int* row_ptr = (int*)A->rowPtr;
|
||||
int* col_ind = (int*)A->colInd;
|
||||
CG_UINT start_row = A->startRow;
|
||||
CG_UINT stop_row = A->stopRow;
|
||||
CG_UINT total_nrow = A->totalNr;
|
||||
CG_UINT total_nnz = A->totalNnz;
|
||||
CG_UINT local_nrow = A->nr;
|
||||
CG_UINT local_nnz = A->nnz;
|
||||
CG_UINT* row_ptr = A->rowPtr;
|
||||
CG_UINT* col_ind = A->colInd;
|
||||
|
||||
// We need to convert the index values for the rows on this processor
|
||||
// to a local index space. We need to:
|
||||
@ -58,48 +205,54 @@ void commPartition(Comm* c, Matrix* A)
|
||||
// - find out which processor owns the value.
|
||||
// - Set up communication for sparse MV operation.
|
||||
|
||||
// Scan the indices and transform to local
|
||||
int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
|
||||
int num_external = 1;
|
||||
// FIXME: Use a lookup table with size total number of rows. For lower memory
|
||||
// consumption a map would be better choice.
|
||||
int* externals = (int*)allocate(ARRAY_ALIGNMENT, total_nrow * sizeof(int));
|
||||
int num_external = 0; // local number of external indices
|
||||
|
||||
// column indices that are not processed yet are marked with -1
|
||||
for (int i = 0; i < total_nrow; i++) {
|
||||
externals[i] = -1;
|
||||
}
|
||||
|
||||
int* external_index = (int*)allocate(ARRAY_ALIGNMENT,
|
||||
MAX_EXTERNAL * sizeof(int));
|
||||
c->external_index = external_index;
|
||||
|
||||
for (int i = 0; i < A->totalNr; i++) {
|
||||
externals[i] = -1;
|
||||
}
|
||||
|
||||
for (int i = 0; i < local_nrow; i++) {
|
||||
for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) {
|
||||
int cur_ind = A->colInd[j];
|
||||
|
||||
#ifdef VERBOSE
|
||||
printf("Process %d of %d getting index %d in local row %d\n",
|
||||
printf("Rank %d of %d getting entry %d:index %d in local row %d\n",
|
||||
rank,
|
||||
size,
|
||||
j,
|
||||
cur_ind,
|
||||
i);
|
||||
#endif
|
||||
|
||||
// shift local rows to the start
|
||||
// convert local column references to local numbering
|
||||
if (start_row <= cur_ind && cur_ind <= stop_row) {
|
||||
col_ind[j] -= start_row;
|
||||
} else {
|
||||
// Must find out if we have already set up this point
|
||||
// find out if we have already set up this point
|
||||
if (externals[cur_ind] == -1) {
|
||||
externals[cur_ind] = num_external++;
|
||||
externals[cur_ind] = num_external;
|
||||
|
||||
if (num_external <= MAX_EXTERNAL) {
|
||||
external_index[num_external - 1] = cur_ind;
|
||||
// Mark index as external by negating it
|
||||
col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
|
||||
external_index[num_external] = cur_ind;
|
||||
// mark in local column index as external by negating it
|
||||
// col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
|
||||
col_ind[j] = -col_ind[j];
|
||||
} else {
|
||||
printf("Must increase MAX_EXTERNAL\n");
|
||||
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
num_external++;
|
||||
} else {
|
||||
// Mark index as external by negating it
|
||||
// Mark index as external by adding 1 and negating it
|
||||
// col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
|
||||
col_ind[j] = -col_ind[j];
|
||||
}
|
||||
}
|
||||
@ -109,44 +262,42 @@ void commPartition(Comm* c, Matrix* A)
|
||||
/**************************************************************************
|
||||
Go through list of externals to find out which processors must be accessed.
|
||||
**************************************************************************/
|
||||
c->num_external = num_external;
|
||||
int tmp_buffer[size];
|
||||
int global_index_offsets[size];
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
tmp_buffer[i] = 0;
|
||||
}
|
||||
|
||||
tmp_buffer[rank] = start_row;
|
||||
|
||||
MPI_Allreduce(tmp_buffer,
|
||||
global_index_offsets,
|
||||
size,
|
||||
MPI_INT,
|
||||
MPI_SUM,
|
||||
MPI_COMM_WORLD);
|
||||
|
||||
// Go through list of externals and find the processor that owns each
|
||||
int external_processor[num_external];
|
||||
|
||||
{
|
||||
int globalIndexOffsets[size];
|
||||
|
||||
MPI_Allgather(&start_row,
|
||||
1,
|
||||
MPI_INT,
|
||||
globalIndexOffsets,
|
||||
1,
|
||||
MPI_INT,
|
||||
MPI_COMM_WORLD);
|
||||
|
||||
// for (int i = 0; i < size; i++) {
|
||||
// printf("Rank %d: i = %d: OFFSET %d\n", rank, i, globalIndexOffsets[i]);
|
||||
// }
|
||||
|
||||
// Go through list of externals and find the processor that owns each
|
||||
|
||||
for (int i = 0; i < num_external; i++) {
|
||||
int cur_ind = external_index[i];
|
||||
int globalIndex = external_index[i];
|
||||
for (int j = size - 1; j >= 0; j--) {
|
||||
if (global_index_offsets[j] <= cur_ind) {
|
||||
if (globalIndexOffsets[j] <= globalIndex) {
|
||||
external_processor[i] = j;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
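// Example of the owner lookup above (hypothetical offsets): with size = 4 and
// globalIndexOffsets = {0, 250, 500, 750}, a global index of 612 is owned by
// rank 2, the highest rank whose start row (500) is still <= 612. Scanning j
// from size - 1 downward finds exactly that rank.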
|
||||
/*Go through the external elements. For each newly encountered external
|
||||
point assign it the next index in the local sequence. Then look for other
|
||||
external elements who are updated by the same node and assign them the next
|
||||
external elements who are updated by the same rank and assign them the next
|
||||
set of index numbers in the local sequence (ie. elements updated by the same
|
||||
node have consecutive indices).*/
|
||||
rank have consecutive indices).*/
|
||||
int* external_local_index = (int*)allocate(ARRAY_ALIGNMENT,
|
||||
MAX_EXTERNAL * sizeof(int));
|
||||
c->external_local_index = external_local_index;
|
||||
num_external * sizeof(int));
|
||||
|
||||
int count = local_nrow;
|
||||
|
||||
@ -172,27 +323,39 @@ void commPartition(Comm* c, Matrix* A)
|
||||
for (int i = 0; i < local_nrow; i++) {
|
||||
for (int j = rowPtr[i]; j < rowPtr[i + 1]; j++) {
|
||||
if (col_ind[j] < 0) {
|
||||
int cur_ind = -col_ind[j] - 1; // FIXME: Offset by 1??
|
||||
// size_t cur_ind = -(col_ind[j] - 1); // FIXME: Offset by 1??
|
||||
int cur_ind = -col_ind[j];
|
||||
col_ind[j] = external_local_index[externals[cur_ind]];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int new_external_processor[num_external];
|
||||
|
||||
for (int i = 0; i < num_external; i++) {
|
||||
new_external_processor[i] = 0;
|
||||
}
|
||||
|
||||
// setup map from external id to partition
|
||||
for (int i = 0; i < num_external; i++) {
|
||||
new_external_processor[external_local_index[i] - local_nrow] =
|
||||
external_processor[i];
|
||||
}
|
||||
|
||||
free(externals);
|
||||
// int new_external_processor[num_external];
|
||||
//
|
||||
// for (int i = 0; i < num_external; i++) {
|
||||
// new_external_processor[i] = -1;
|
||||
// }
|
||||
//
|
||||
// // setup map from external id to partition
|
||||
// for (int i = 0; i < num_external; i++) {
|
||||
// int id = external_local_index[i] - local_nrow;
|
||||
// new_external_processor[id] = external_processor[i];
|
||||
// printf("Rank %d of %d: %d new_external_processor[%d] = %d\n",
|
||||
// rank,
|
||||
// size,
|
||||
// i,
|
||||
// id,
|
||||
// external_processor[i]);
|
||||
// }
|
||||
// commFinalize(c);
|
||||
// exit(EXIT_SUCCESS);
|
||||
//
|
||||
#ifdef VERBOSE
|
||||
printf("Rank %d of %d: %d externals\n", rank, size, num_external);
|
||||
|
||||
for (int i = 0; i < num_external; i++) {
|
||||
printf("Process %d of %d: external process[%d] = %d\n",
|
||||
printf("Rank %d of %d: external[%d] owned by %d\n",
|
||||
rank,
|
||||
size,
|
||||
i,
|
||||
@ -217,119 +380,93 @@ void commPartition(Comm* c, Matrix* A)
|
||||
int num_recv_neighbors = 0;
|
||||
int length = 1;
|
||||
|
||||
// Encoding both number of ranks that need values from this rank and the total
|
||||
// number of values by adding one for any additional rank and adding size for
|
||||
// every additional value.
|
||||
for (int i = 0; i < num_external; i++) {
|
||||
if (tmp_neighbors[new_external_processor[i]] == 0) {
|
||||
if (tmp_neighbors[external_processor[i]] == 0) {
|
||||
num_recv_neighbors++;
|
||||
tmp_neighbors[new_external_processor[i]] = 1;
|
||||
tmp_neighbors[external_processor[i]] = 1;
|
||||
}
|
||||
tmp_neighbors[new_external_processor[i]] += size;
|
||||
tmp_neighbors[external_processor[i]] += size;
|
||||
}
|
||||
|
||||
// sum over all processors all the tmp_neighbors arrays
|
||||
MPI_Allreduce(tmp_neighbors,
|
||||
tmp_buffer,
|
||||
MPI_Allreduce(MPI_IN_PLACE,
|
||||
tmp_neighbors,
|
||||
size,
|
||||
MPI_INT,
|
||||
MPI_SUM,
|
||||
MPI_COMM_WORLD);
|
||||
|
||||
/* decode the combined 'tmp_neighbors' (stored in tmp_buffer) array from all
|
||||
* the processors */
|
||||
int num_send_neighbors = tmp_buffer[rank] % size;
|
||||
/* decode the combined 'tmp_neighbors' array from all ranks */
|
||||
// Number of ranks that receive values from us
|
||||
int num_send_neighbors = tmp_neighbors[rank] % size;
|
||||
|
||||
/* decode 'tmp_buffer[rank] to deduce total number of elements we must send */
|
||||
int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;
|
||||
/* decode 'tmp_neighbors[rank] to deduce total number of elements we must send
|
||||
*/
|
||||
c->totalSendCount = (tmp_neighbors[rank] - num_send_neighbors) / size;
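// Worked example of the encoding (hypothetical numbers): with size = 4, if
// rank 1 needs 3 of our rows it adds 1 + 3*4 = 13 to our slot, and if rank 3
// needs 5 rows it adds 1 + 5*4 = 21. After the summing Allreduce our slot
// holds 34, so num_send_neighbors = 34 % 4 = 2 and
// c->totalSendCount = (34 - 2) / 4 = 8.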
|
||||
|
||||
/* Check to see if we have enough workspace allocated. This could be
|
||||
/* Check to see if we have enough memory allocated. This could be
|
||||
dynamically modified, but let's keep it simple for now...*/
|
||||
if (num_send_neighbors > MAX_NUM_MESSAGES) {
|
||||
printf("Must increase MAX_NUM_MESSAGES. Must be at least %d\n",
|
||||
num_send_neighbors);
|
||||
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
if (total_to_be_sent > MAX_EXTERNAL) {
|
||||
if (c->totalSendCount > MAX_EXTERNAL) {
|
||||
printf("Must increase MAX_EXTERNAL. Must be at least %d\n",
|
||||
total_to_be_sent);
|
||||
c->totalSendCount);
|
||||
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
#ifdef VERBOSE
|
||||
cout << "Processor " << rank << " of " << size
|
||||
<< ": Number of send neighbors = " << num_send_neighbors << endl;
|
||||
|
||||
cout << "Processor " << rank << " of " << size
|
||||
<< ": Number of receive neighbors = " << num_recv_neighbors << endl;
|
||||
|
||||
cout << "Processor " << rank << " of " << size
|
||||
<< ": Total number of elements to send = " << total_to_be_sent << endl;
|
||||
|
||||
printf("Rank %d of %d: tmp_neighbors = %d\n",
|
||||
rank,
|
||||
size,
|
||||
tmp_neighbors[rank]);
|
||||
printf("Rank %d of %d: Number of send neighbors = %d\n",
|
||||
rank,
|
||||
size,
|
||||
num_send_neighbors);
|
||||
printf("Rank %d of %d: Number of receive neighbors = %d\n",
|
||||
rank,
|
||||
size,
|
||||
num_recv_neighbors);
|
||||
printf("Rank %d of %d: Total number of elements to send = %d\n",
|
||||
rank,
|
||||
size,
|
||||
c->totalSendCount);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
#endif
|
||||
|
||||
/* Make a list of the neighbors that will send information to update our
|
||||
external elements (in the order that we will receive this information).*/
|
||||
int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
|
||||
int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_NUM_MESSAGES * sizeof(int));
|
||||
|
||||
// FIXME: Create local scope
|
||||
{
|
||||
int j = 0;
|
||||
recv_list[j++] = new_external_processor[0];
|
||||
recv_list[j++] = external_processor[0];
|
||||
|
||||
for (int i = 1; i < num_external; i++) {
|
||||
if (new_external_processor[i - 1] != new_external_processor[i]) {
|
||||
recv_list[j++] = new_external_processor[i];
|
||||
if (external_processor[i - 1] != external_processor[i]) {
|
||||
recv_list[j++] = external_processor[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Ensure that all the neighbors we expect to receive from also send to us
|
||||
// Send a 0 length message to each of our recv neighbors
|
||||
int send_list[num_send_neighbors];
|
||||
|
||||
for (int i = 0; i < num_send_neighbors; i++) {
|
||||
send_list[i] = 0;
|
||||
}
|
||||
|
||||
// first post receives, these are immediate receives
|
||||
// Do not wait for result to come, will do that at the
|
||||
// wait call below.
|
||||
int MPI_MY_TAG = 99;
|
||||
|
||||
MPI_Request request[MAX_NUM_MESSAGES];
|
||||
|
||||
for (int i = 0; i < num_send_neighbors; i++) {
|
||||
MPI_Irecv(tmp_buffer + i,
|
||||
1,
|
||||
MPI_INT,
|
||||
MPI_ANY_SOURCE,
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD,
|
||||
request + i);
|
||||
}
|
||||
|
||||
// send messages
|
||||
for (int i = 0; i < num_recv_neighbors; i++) {
|
||||
MPI_Send(tmp_buffer + i,
|
||||
1,
|
||||
MPI_INT,
|
||||
recv_list[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
// Receive message from each send neighbor to construct 'send_list'.
|
||||
MPI_Status status;
|
||||
for (int i = 0; i < num_send_neighbors; i++) {
|
||||
if (MPI_Wait(request + i, &status)) {
|
||||
printf("MPI_Wait error\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
send_list[i] = status.MPI_SOURCE;
|
||||
}
|
||||
probeNeighbors(send_list, num_send_neighbors, recv_list, num_recv_neighbors);
|
||||
|
||||
/* Compare the two lists. In most cases they should be the same.
|
||||
// However, if they are not then add new entries to the recv list
|
||||
// that are in the send list (but not already in the recv list).
|
||||
WHY!! This ensures that the sendlist is equal to the sendlist
|
||||
FIXME: WHY!! This ensures that the recv_list is equal to the sendlist
|
||||
But why is this required? -> Just One neighbour list??*/
|
||||
for (int j = 0; j < num_send_neighbors; j++) {
|
||||
int found = 0;
|
||||
@ -343,31 +480,25 @@ void commPartition(Comm* c, Matrix* A)
|
||||
rank,
|
||||
size,
|
||||
num_recv_neighbors,
|
||||
send_list[i]);
|
||||
send_list[j]);
|
||||
#endif
|
||||
recv_list[num_recv_neighbors] = send_list[j];
|
||||
(num_recv_neighbors)++;
|
||||
num_recv_neighbors++;
|
||||
}
|
||||
}
|
||||
num_send_neighbors = num_recv_neighbors;
|
||||
|
||||
if (num_send_neighbors > MAX_NUM_MESSAGES) {
|
||||
// From here on only have one neighbor list for both send and recv
|
||||
c->numNeighbors = num_recv_neighbors;
|
||||
for (int i = 0; i < c->numNeighbors; i++) {
|
||||
c->neighbors[i] = recv_list[i];
|
||||
}
|
||||
|
||||
if (c->numNeighbors > MAX_NUM_MESSAGES) {
|
||||
printf("Must increase MAX_EXTERNAL\n");
|
||||
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Start filling communication setup
|
||||
// Create 'new_external' which explicitly put the external elements in the
|
||||
// order given by 'external_local_index'
|
||||
c->total_to_be_sent = total_to_be_sent;
|
||||
int* elements_to_send = (int*)allocate(ARRAY_ALIGNMENT,
|
||||
total_to_be_sent * sizeof(int));
|
||||
c->elements_to_send = elements_to_send;
|
||||
|
||||
for (int i = 0; i < total_to_be_sent; i++) {
|
||||
elements_to_send[i] = 0;
|
||||
}
|
||||
|
||||
// Create 'new_external' which explicitly put the external elements in the
|
||||
// order given by 'external_local_index'
|
||||
int* new_external = (int*)allocate(ARRAY_ALIGNMENT,
|
||||
@ -377,195 +508,100 @@ void commPartition(Comm* c, Matrix* A)
|
||||
new_external[external_local_index[i] - local_nrow] = external_index[i];
|
||||
}
|
||||
|
||||
free(external_local_index);
|
||||
free(external_index);
|
||||
c->numExternal = num_external;
|
||||
|
||||
buildNeighbors(c, external_processor);
|
||||
|
||||
// Send each processor the global index list of the external elements in the
|
||||
// order that I will want to receive them when updating my external elements
|
||||
int lengths[num_recv_neighbors];
|
||||
MPI_MY_TAG++;
|
||||
// Build "elementsToSend" list. These are the x elements the current rank
|
||||
// owns that need to be sent to other ranks
|
||||
buildElementsToSend(c, A->startRow, external_processor, new_external);
|
||||
|
||||
// First post receives
|
||||
for (int i = 0; i < num_recv_neighbors; i++) {
|
||||
int partner = recv_list[i];
|
||||
MPI_Irecv(lengths + i,
|
||||
1,
|
||||
MPI_INT,
|
||||
partner,
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD,
|
||||
request + i);
|
||||
}
|
||||
|
||||
int* neighbors = c->neighbors;
|
||||
int* recv_length = c->recv_length;
|
||||
int* send_length = c->send_length;
|
||||
|
||||
j = 0;
|
||||
|
||||
for (int i = 0; i < num_recv_neighbors; i++) {
|
||||
int start = j;
|
||||
int newlength = 0;
|
||||
|
||||
// go through list of external elements until updating
|
||||
// processor changes
|
||||
while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
|
||||
newlength++;
|
||||
j++;
|
||||
if (j == num_external) break;
|
||||
}
|
||||
|
||||
recv_length[i] = newlength;
|
||||
neighbors[i] = recv_list[i];
|
||||
|
||||
length = j - start;
|
||||
MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
// Complete the receives of the number of externals
|
||||
for (int i = 0; i < num_recv_neighbors; i++) {
|
||||
if (MPI_Wait(request + i, &status)) {
|
||||
printf("MPI_Wait error\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
send_length[i] = lengths[i];
|
||||
}
|
||||
|
||||
// Build "elements_to_send" list. These are the x elements I own
|
||||
// that need to be sent to other processors.
|
||||
MPI_MY_TAG++;
|
||||
|
||||
j = 0;
|
||||
|
||||
for (int i = 0; i < num_recv_neighbors; i++) {
|
||||
MPI_Irecv(elements_to_send + j,
|
||||
send_length[i],
|
||||
MPI_INT,
|
||||
neighbors[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD,
|
||||
request + i);
|
||||
j += send_length[i];
|
||||
}
|
||||
|
||||
j = 0;
|
||||
|
||||
for (int i = 0; i < num_recv_neighbors; i++) {
|
||||
int start = j;
|
||||
int newlength = 0;
|
||||
|
||||
// Go through list of external elements
|
||||
// until updating processor changes. This is redundant, but
|
||||
// saves us from recording this information.
|
||||
while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
|
||||
|
||||
newlength++;
|
||||
j++;
|
||||
if (j == num_external) break;
|
||||
}
|
||||
MPI_Send(new_external + start,
|
||||
j - start,
|
||||
MPI_INT,
|
||||
recv_list[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD);
|
||||
}
|
||||
|
||||
// receive from each neighbor the global index list of external elements
|
||||
for (int i = 0; i < num_recv_neighbors; i++) {
|
||||
if (MPI_Wait(request + i, &status)) {
|
||||
printf("MPI_Wait error\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
|
||||
/// replace global indices by local indices
|
||||
for (int i = 0; i < total_to_be_sent; i++) {
|
||||
elements_to_send[i] -= start_row;
|
||||
}
|
||||
|
||||
// Finish up !!
|
||||
c->num_send_neighbors = num_send_neighbors;
|
||||
A->nc = A->nc + num_external;
|
||||
|
||||
// Used in exchange
|
||||
CG_FLOAT* send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
|
||||
total_to_be_sent * sizeof(CG_FLOAT));
|
||||
c->send_buffer = send_buffer;
|
||||
c->send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
|
||||
c->totalSendCount * sizeof(CG_FLOAT));
|
||||
|
||||
free(recv_list);
|
||||
free(new_external);
|
||||
#endif
|
||||
}
|
||||
|
||||
void commExchange(Comm* c, Matrix* A, double* x)
|
||||
void commExchange(Comm* c, Matrix* A, CG_FLOAT* x)
|
||||
{
|
||||
#ifdef _MPI
|
||||
int num_external = 0;
|
||||
|
||||
// Extract Matrix pieces
|
||||
|
||||
int local_nrow = A->nr;
|
||||
int num_neighbors = c->num_send_neighbors;
|
||||
int* recv_length = c->recv_length;
|
||||
int* send_length = c->send_length;
|
||||
int numNeighbors = c->numNeighbors;
|
||||
int* neighbors = c->neighbors;
|
||||
double* send_buffer = c->send_buffer;
|
||||
int total_to_be_sent = c->total_to_be_sent;
|
||||
int* elements_to_send = c->elements_to_send;
|
||||
int* recvCount = c->recvCount;
|
||||
int* sendCount = c->sendCount;
|
||||
CG_FLOAT* sendBuffer = c->send_buffer;
|
||||
int* elementsToSend = c->elementsToSend;
|
||||
|
||||
int rank = c->rank;
|
||||
int size = c->size;
|
||||
MPI_Comm comm = c->comm;
|
||||
|
||||
// first post receives, these are immediate receives
|
||||
// Do not wait for result to come, will do that at the
|
||||
// wait call below.
|
||||
int MPI_MY_TAG = 99;
|
||||
|
||||
MPI_Request request[num_neighbors];
|
||||
MPI_Request request[numNeighbors];
|
||||
|
||||
// Externals are at end of locals
|
||||
double* x_external = (double*)x + local_nrow;
|
||||
CG_FLOAT* externals = x + A->nr;
|
||||
|
||||
// Post receives first
|
||||
for (int i = 0; i < num_neighbors; i++) {
|
||||
int n_recv = recv_length[i];
|
||||
MPI_Irecv(x_external,
|
||||
n_recv,
|
||||
MPI_DOUBLE,
|
||||
// Post receives
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
int count = recvCount[i];
|
||||
|
||||
MPI_Irecv(externals,
|
||||
count,
|
||||
MPI_FLOAT_TYPE,
|
||||
neighbors[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD,
|
||||
request + i);
|
||||
x_external += n_recv;
|
||||
|
||||
externals += count;
|
||||
}
|
||||
|
||||
// Fill up send buffer
|
||||
for (int i = 0; i < total_to_be_sent; i++) {
|
||||
send_buffer[i] = x[elements_to_send[i]];
|
||||
// Copy values for all ranks into send buffer
|
||||
for (int i = 0; i < c->totalSendCount; i++) {
|
||||
sendBuffer[i] = x[elementsToSend[i]];
|
||||
}
|
||||
|
||||
// Send to each neighbor
|
||||
for (int i = 0; i < num_neighbors; i++) {
|
||||
int n_send = send_length[i];
|
||||
MPI_Send(send_buffer,
|
||||
n_send,
|
||||
MPI_DOUBLE,
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
int count = sendCount[i];
|
||||
|
||||
MPI_Send(sendBuffer,
|
||||
count,
|
||||
MPI_FLOAT_TYPE,
|
||||
neighbors[i],
|
||||
MPI_MY_TAG,
|
||||
MPI_COMM_WORLD);
|
||||
send_buffer += n_send;
|
||||
|
||||
sendBuffer += count;
|
||||
}
|
||||
|
||||
// Complete the reads issued above
|
||||
// Complete the receives issued above
|
||||
MPI_Status status;
|
||||
for (int i = 0; i < num_neighbors; i++) {
|
||||
for (int i = 0; i < numNeighbors; i++) {
|
||||
if (MPI_Wait(request + i, &status)) {
|
||||
printf("MPI_Wait error\n");
|
||||
MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
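// Typical use (sketch, not a call that appears verbatim in this diff): the
// halo exchange has to complete before each sparse matrix-vector product so
// that the external entries appended behind the A->nr local entries of x are
// up to date. p and Ap are hypothetical vector names:
//
//   commExchange(&comm, &s.A, p);
//   spMVM(&s.A, p, Ap);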
|
||||
|
||||
void commReduction(double* v, int op)
|
||||
{
|
||||
#ifdef _MPI
|
||||
if (op == MAX) {
|
||||
MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
|
||||
} else if (op == SUM) {
|
||||
MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
|
||||
}
|
||||
#endif
|
||||
}
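// Usage sketch (hypothetical call site): commReduction turns a rank-local
// partial result into a global one in place, and compiles to a no-op without
// _MPI, e.g. for a dot product:
//
//   double globalSum = localSum;    // localSum: hypothetical rank-local value
//   commReduction(&globalSum, SUM); // SUM/MAX as used by the branches above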
|
||||
|
||||
void commPrintConfig(Comm* c)
|
||||
{
|
||||
#ifdef _MPI
|
||||
@ -577,6 +613,21 @@ void commPrintConfig(Comm* c)
|
||||
|
||||
for (int i = 0; i < c->size; i++) {
|
||||
if (i == c->rank) {
|
||||
printf("Rank %d has %d neighbors with %d externals:\n",
|
||||
c->rank,
|
||||
c->numNeighbors,
|
||||
c->numExternal);
|
||||
for (int j = 0; j < c->numNeighbors; j++) {
|
||||
printf("\t%d: receive %d send %d\n",
|
||||
c->neighbors[j],
|
||||
c->recvCount[j],
|
||||
c->sendCount[j]);
|
||||
}
|
||||
printf("\tSend %d elements: [", c->totalSendCount);
|
||||
for (int j = 0; j < c->totalSendCount; j++) {
|
||||
printf("%d, ", c->elementsToSend[j]);
|
||||
}
|
||||
printf("]\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
@ -594,14 +645,14 @@ void commMatrixDump(Comm* c, Matrix* m)
|
||||
CG_FLOAT* val = m->val;
|
||||
|
||||
if (commIsMaster(c)) {
|
||||
printf("Matrix: %lld total non zeroes, total number of rows %lld\n",
|
||||
printf("Matrix: %d total non zeroes, total number of rows %d\n",
|
||||
m->totalNnz,
|
||||
m->totalNr);
|
||||
}
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
if (i == rank) {
|
||||
printf("Rank %d: %lld non zeroes, number of rows %lld\n",
|
||||
printf("Rank %d: %d non zeroes, number of rows %d\n",
|
||||
rank,
|
||||
m->nnz,
|
||||
numRows);
|
||||
@ -609,9 +660,9 @@ void commMatrixDump(Comm* c, Matrix* m)
|
||||
for (int rowID = 0; rowID < numRows; rowID++) {
|
||||
printf("Row [%d]: ", rowID);
|
||||
|
||||
for (size_t rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
|
||||
for (int rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
|
||||
rowEntry++) {
|
||||
printf("[%lld]:%.2f ", colInd[rowEntry], val[rowEntry]);
|
||||
printf("[%d]:%.2f ", colInd[rowEntry], val[rowEntry]);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
19 src/comm.h
@@ -4,6 +4,7 @@
* license that can be found in the LICENSE file. */
#ifndef __COMM_H_
#define __COMM_H_
#include "util.h"
#if defined(_MPI)
#include <mpi.h>
#endif
@@ -20,18 +21,14 @@ typedef struct {
    int rank;
    int size;
#if defined(_MPI)
    MPI_Comm comm;

    int num_external;
    int num_send_neighbors;
    int* external_index;
    int* external_local_index;
    int total_to_be_sent;
    int* elements_to_send;
    int numNeighbors;
    int numExternal;
    int totalSendCount;
    int* elementsToSend;
    int neighbors[MAX_NUM_NEIGHBOURS];
    int recv_length[MAX_NUM_NEIGHBOURS];
    int send_length[MAX_NUM_NEIGHBOURS];
    double* send_buffer;
    int recvCount[MAX_NUM_NEIGHBOURS];
    int sendCount[MAX_NUM_NEIGHBOURS];
    CG_FLOAT* send_buffer;
#endif
} Comm;
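/* How the new fields relate (descriptive note): for each neighbor i
 * (0 <= i < numNeighbors), neighbors[i] is the partner rank, recvCount[i]
 * values are received from it into the external part of x, and sendCount[i]
 * of our own values are packed from elementsToSend into send_buffer for it;
 * totalSendCount is the sum of all sendCount entries. */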
@@ -55,9 +55,9 @@ int main(int argc, char** argv)
    CG_FLOAT eps = (CG_FLOAT)param.eps;
    int itermax = param.itermax;
    initSolver(&s, &comm, &param);
    commMatrixDump(&comm, &s.A);
    commFinalize(&comm);
    exit(EXIT_SUCCESS);
    // commMatrixDump(&comm, &s.A);
    commPartition(&comm, &s.A);
    commPrintConfig(&comm);

    CG_UINT nrow = s.A.nr;
    CG_UINT ncol = s.A.nc;
16 src/solver.c
@@ -109,7 +109,7 @@ static void matrixGenerate(
#ifdef VERBOSE
    printf("Process %d of %d has %d rows\n", rank, size, local_nrow);
    printf("Global rows %d through %d\n", start_row, stop_row);
    printf("%d nonzeros\n", start_row, stop_row);
    printf("%d nonzeros\n", local_nnz);
#endif /* ifdef VERBOSE */

    s->A.startRow = start_row;
@@ -139,11 +139,11 @@ void spMVM(Matrix* m, const CG_FLOAT* restrict x, CG_FLOAT* restrict y)
    CG_UINT* colInd = m->colInd;
    CG_FLOAT* val = m->val;

    for (size_t rowID = 0; rowID < numRows; rowID++) {
    for (int rowID = 0; rowID < numRows; rowID++) {
        CG_FLOAT tmp = y[rowID];

        // loop over all elements in row
        for (size_t rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
        for (int rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
            rowEntry++) {
            tmp += val[rowEntry] * x[colInd[rowEntry]];
        }
@@ -160,15 +160,15 @@ void waxpby(const CG_UINT n,
    CG_FLOAT* const w)
{
    if (alpha == 1.0) {
        for (size_t i = 0; i < n; i++) {
        for (int i = 0; i < n; i++) {
            w[i] = x[i] + beta * y[i];
        }
    } else if (beta == 1.0) {
        for (size_t i = 0; i < n; i++) {
        for (int i = 0; i < n; i++) {
            w[i] = alpha * x[i] + y[i];
        }
    } else {
        for (size_t i = 0; i < n; i++) {
        for (int i = 0; i < n; i++) {
            w[i] = alpha * x[i] + beta * y[i];
        }
    }
@@ -182,11 +182,11 @@ void ddot(const CG_UINT n,
    CG_FLOAT sum = 0.0;

    if (y == x) {
        for (size_t i = 0; i < n; i++) {
        for (int i = 0; i < n; i++) {
            sum += x[i] * x[i];
        }
    } else {
        for (size_t i = 0; i < n; i++) {
        for (int i = 0; i < n; i++) {
            sum += x[i] * y[i];
        }
    }
@@ -23,13 +23,16 @@
#define MAXLINE 4096
#endif

#define CG_UINT unsigned long long int
// #define CG_UINT unsigned long long int
#define CG_UINT int

#if PRECISION == 1
#define CG_FLOAT float
#define MPI_FLOAT_TYPE MPI_FLOAT
#define PRECISION_STRING "single"
#else
#define CG_FLOAT double
#define MPI_FLOAT_TYPE MPI_DOUBLE
#define PRECISION_STRING "double"
#endif
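// Sketch of how the paired macros are meant to be used together (hypothetical
// call, not taken from this diff): the MPI datatype must always match
// CG_FLOAT, which is what commExchange relies on, e.g.
//
//   CG_FLOAT norm = localNorm; // localNorm: hypothetical rank-local value
//   MPI_Allreduce(MPI_IN_PLACE, &norm, 1, MPI_FLOAT_TYPE, MPI_SUM, MPI_COMM_WORLD);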