From 49dc2d97aeedb3ca19e59963e9b5eec27f61c95b Mon Sep 17 00:00:00 2001
From: Jan Eitzinger
Date: Mon, 20 Jan 2025 18:24:37 +0100
Subject: [PATCH] Fix bugs and clean up. Add subroutines.

---
 hpcg.par            |   6 +-
 mk/include_CLANG.mk |   2 +-
 src/comm.c          | 687 ++++++++++++++++++++++++--------------------
 src/comm.h          |  19 +-
 src/main.c          |   6 +-
 src/solver.c        |  16 +-
 src/util.h          |   5 +-
 7 files changed, 396 insertions(+), 345 deletions(-)

diff --git a/hpcg.par b/hpcg.par
index ce11b32..ed20133 100644
--- a/hpcg.par
+++ b/hpcg.par
@@ -3,9 +3,9 @@
 #==============================================================================
 
 filename generate
-nx 50
-ny 50
-nz 50
+nx 10
+ny 10
+nz 10
 
 itermax 10000
 eps 0.0001
diff --git a/mk/include_CLANG.mk b/mk/include_CLANG.mk
index b3b2415..d0641ab 100644
--- a/mk/include_CLANG.mk
+++ b/mk/include_CLANG.mk
@@ -17,5 +17,5 @@ VERSION = --version
 CFLAGS = -O3 -ffast-math -std=c99 $(OPENMP)
 #CFLAGS = -Ofast -fnt-store=aggressive -std=c99 $(OPENMP) #AMD CLANG
 LFLAGS = $(OPENMP)
-DEFINES += -D_GNU_SOURCE
+DEFINES += -D_GNU_SOURCE# -DVERBOSE
 INCLUDES = -I/Users/jan/.local/include
diff --git a/src/comm.c b/src/comm.c
index 7957178..a006daa 100644
--- a/src/comm.c
+++ b/src/comm.c
@@ -3,6 +3,7 @@
  * Use of this source code is governed by a MIT style
  * license that can be found in the LICENSE file. */
 #include "util.h"
+#include
 #include
 #include
 
@@ -19,33 +20,179 @@ static int sizeOfRank(int rank, int size, int N)
     return N / size + ((N % size > rank) ? 1 : 0);
 }
 
-void commReduction(double* v, int op)
+// Ensure that all the neighbors we expect to receive from also send to us
+// sendList - All ranks that receive values from us
+// numSendNeighbors - Number of entries in sendList
+// recvList - All ranks that send values to us
+// numRecvNeighbors - Number of entries in recvList
+// FIXME: What if ranks want to send to us and are not in sendList?
+static void probeNeighbors(
+    int* sendList, int numSendNeighbors, int* recvList, int numRecvNeighbors)
 {
-#ifdef _MPI
-    if (op == MAX) {
-        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
-    } else if (op == SUM) {
-        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+    int val;
+
+    for (int i = 0; i < numSendNeighbors; i++) {
+        sendList[i] = -1;
+    }
+
+    int MPI_MY_TAG = 99;
+    MPI_Request request[MAX_NUM_MESSAGES];
+
+    for (int i = 0; i < numSendNeighbors; i++) {
+        MPI_Irecv(&val,
+            1,
+            MPI_INT,
+            MPI_ANY_SOURCE,
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+    }
+
+    for (int i = 0; i < numRecvNeighbors; i++) {
+        MPI_Send(&val, 1, MPI_INT, recvList[i], MPI_MY_TAG, MPI_COMM_WORLD);
+    }
+
+    // Receive message from each send neighbor to construct 'sendList'.
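+    // The payload of these messages is irrelevant: receiving any message from
+    // a rank proves that it needs values from us, and status.MPI_SOURCE below
+    // records which rank it was.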
+    MPI_Status status;
+    for (int i = 0; i < numSendNeighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            printf("MPI_Wait error\n");
+            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+            exit(EXIT_FAILURE);
+        }
+        sendList[i] = status.MPI_SOURCE;
+    }
+}
+
+static void buildNeighbors(Comm* c, int* external_processor)
+{
+    int numNeighbors = c->numNeighbors;
+    int numExternal = c->numExternal;
+    int* neighbors = c->neighbors;
+    int lengths[numNeighbors];
+    int MPI_MY_TAG = 100;
+    MPI_Request request[MAX_NUM_MESSAGES];
+
+    // First post receives
+    for (int i = 0; i < numNeighbors; i++) {
+        MPI_Irecv(lengths + i,
+            1,
+            MPI_INT,
+            neighbors[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+    }
+
+    int* recvCount = c->recvCount;
+    int* sendCount = c->sendCount;
+
+    int j = 0;
+
+    for (int i = 0; i < numNeighbors; i++) {
+        int count = 0;
+
+        // go through list of external elements until updating
+        // processor changes
+        while ((j < numExternal) && (external_processor[j] == neighbors[i])) {
+            count++;
+            j++;
+            if (j == numExternal) break;
+        }
+
+        recvCount[i] = count;
+        MPI_Send(&count, 1, MPI_INT, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
+    }
+
+    MPI_Status status;
+    // Complete the receives of the number of externals
+    for (int i = 0; i < numNeighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            printf("MPI_Wait error\n");
+            exit(EXIT_FAILURE);
+        }
+        sendCount[i] = lengths[i];
+    }
+}
+
+static void buildElementsToSend(
+    Comm* c, int startRow, int* external_processor, int* new_external)
+{
+    int numNeighbors = c->numNeighbors;
+    int numExternal = c->numExternal;
+    int* neighbors = c->neighbors;
+    int MPI_MY_TAG = 100;
+    MPI_Request request[MAX_NUM_MESSAGES];
+
+    c->elementsToSend = (int*)allocate(ARRAY_ALIGNMENT,
+        c->totalSendCount * sizeof(int));
+    int* elementsToSend = c->elementsToSend;
+
+    int j = 0;
+
+    for (int i = 0; i < numNeighbors; i++) {
+        MPI_Irecv(elementsToSend + j,
+            c->sendCount[i],
+            MPI_INT,
+            neighbors[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD,
+            request + i);
+
+        j += c->sendCount[i];
+    }
+
+    j = 0;
+
+    for (int i = 0; i < numNeighbors; i++) {
+        int start = j;
+
+        // Go through list of external elements
+        // until updating processor changes. This is redundant, but
+        // saves us from recording this information.
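+        // NOTE: this sweep assumes external_processor[] lists each neighbor's
+        // entries consecutively, grouped in the same order as neighbors[].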
+        while ((j < numExternal) && (external_processor[j] == neighbors[i])) {
+            j++;
+            if (j == numExternal) break;
+        }
+
+        MPI_Send(new_external + start,
+            j - start,
+            MPI_INT,
+            neighbors[i],
+            MPI_MY_TAG,
+            MPI_COMM_WORLD);
+    }
+
+    MPI_Status status;
+    // receive from each neighbor the global index list of external elements
+    for (int i = 0; i < numNeighbors; i++) {
+        if (MPI_Wait(request + i, &status)) {
+            printf("MPI_Wait error\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    // replace global indices by local indices
+    for (int i = 0; i < c->totalSendCount; i++) {
+        elementsToSend[i] -= startRow;
     }
-#endif
 }
 
 void commPartition(Comm* c, Matrix* A)
 {
 #ifdef _MPI
-    int rank = c->rank;
-    int size = c->size;
-    MPI_Comm comm = c->comm;
+    int rank = c->rank;
+    int size = c->size;
 
     // Extract Matrix pieces
-    int start_row = A->startRow;
-    int stop_row = A->stopRow;
-    int total_nrow = A->totalNr;
-    long long total_nnz = A->totalNnz;
-    int local_nrow = A->nr;
-    int local_nnz = A->nnz;
-    int* row_ptr = (int*)A->rowPtr;
-    int* col_ind = (int*)A->colInd;
+    CG_UINT start_row = A->startRow;
+    CG_UINT stop_row = A->stopRow;
+    CG_UINT total_nrow = A->totalNr;
+    CG_UINT total_nnz = A->totalNnz;
+    CG_UINT local_nrow = A->nr;
+    CG_UINT local_nnz = A->nnz;
+    CG_UINT* row_ptr = A->rowPtr;
+    CG_UINT* col_ind = A->colInd;
 
     // We need to convert the index values for the rows on this processor
     // to a local index space. We need to:
@@ -58,48 +205,54 @@ void commPartition(Comm* c, Matrix* A)
     //   - find out which processor owns the value.
     // - Set up communication for sparse MV operation.
 
-    // Scan the indices and transform to local
-    int* externals = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
-    int num_external = 1;
+    // FIXME: Use a lookup table sized to the total number of rows. For lower
+    // memory consumption a map would be a better choice.
+    int* externals = (int*)allocate(ARRAY_ALIGNMENT, total_nrow * sizeof(int));
+    int num_external = 0; // local number of external indices
+
+    // column indices that are not processed yet are marked with -1
+    for (int i = 0; i < total_nrow; i++) {
+        externals[i] = -1;
+    }
 
     int* external_index = (int*)allocate(ARRAY_ALIGNMENT,
         MAX_EXTERNAL * sizeof(int));
-    c->external_index = external_index;
-
-    for (int i = 0; i < A->totalNr; i++) {
-        externals[i] = -1;
-    }
 
     for (int i = 0; i < local_nrow; i++) {
         for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) {
             int cur_ind = A->colInd[j];
 
 #ifdef VERBOSE
-            printf("Process %d of %d getting index %d in local row %d\n",
+            printf("Rank %d of %d getting entry %d:index %d in local row %d\n",
                 rank,
                 size,
+                j,
                 cur_ind,
                 i);
 #endif
-            // shift local rows to the start
+            // convert local column references to local numbering
             if (start_row <= cur_ind && cur_ind <= stop_row) {
                 col_ind[j] -= start_row;
             } else {
-                // Must find out if we have already set up this point
+                // find out if we have already set up this point
                 if (externals[cur_ind] == -1) {
-                    externals[cur_ind] = num_external++;
+                    externals[cur_ind] = num_external;
 
                     if (num_external <= MAX_EXTERNAL) {
-                        external_index[num_external - 1] = cur_ind;
-                        // Mark index as external by negating it
-                        col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
+                        external_index[num_external] = cur_ind;
+                        // mark the local column index as external by negating it
+                        // col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
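+                        // NOTE: plain negation cannot flag global column 0
+                        // (-0 == 0); the +1 offset kept above for reference
+                        // avoided that. Assumes column 0 is never external.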
+                        col_ind[j] = -col_ind[j];
                     } else {
                         printf("Must increase MAX_EXTERNAL\n");
+                        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
                         exit(EXIT_FAILURE);
                     }
+                    num_external++;
                 } else {
-                    // Mark index as external by negating it
+                    // Mark index as external by adding 1 and negating it
+                    // col_ind[j] = -(col_ind[j] + 1); // FIXME: Offset?
                     col_ind[j] = -col_ind[j];
                 }
             }
@@ -109,44 +262,42 @@ void commPartition(Comm* c, Matrix* A)
     /**************************************************************************
     Go through list of externals to find out which processors must be accessed.
     **************************************************************************/
-    c->num_external = num_external;
-    int tmp_buffer[size];
-    int global_index_offsets[size];
-
-    for (int i = 0; i < size; i++) {
-        tmp_buffer[i] = 0;
-    }
-
-    tmp_buffer[rank] = start_row;
-
-    MPI_Allreduce(tmp_buffer,
-        global_index_offsets,
-        size,
-        MPI_INT,
-        MPI_SUM,
-        MPI_COMM_WORLD);
-
-    // Go through list of externals and find the processor that owns each
     int external_processor[num_external];
 
-    for (int i = 0; i < num_external; i++) {
-        int cur_ind = external_index[i];
-        for (int j = size - 1; j >= 0; j--) {
-            if (global_index_offsets[j] <= cur_ind) {
-                external_processor[i] = j;
-                break;
+    {
+        int globalIndexOffsets[size];
+
+        MPI_Allgather(&start_row,
+            1,
+            MPI_INT,
+            globalIndexOffsets,
+            1,
+            MPI_INT,
+            MPI_COMM_WORLD);
+
+        // for (int i = 0; i < size; i++) {
+        //     printf("Rank %d: i = %d: OFFSET %d\n", rank, i, globalIndexOffsets[i]);
+        // }
+
+        // Go through list of externals and find the processor that owns each
+        for (int i = 0; i < num_external; i++) {
+            int globalIndex = external_index[i];
+            for (int j = size - 1; j >= 0; j--) {
+                if (globalIndexOffsets[j] <= globalIndex) {
+                    external_processor[i] = j;
+                    break;
+                }
             }
         }
     }
 
     /*Go through the external elements. For each newly encountered external
     point assign it the next index in the local sequence. Then look for other
-    external elements who are updated by the same node and assign them the next
+    external elements who are updated by the same rank and assign them the next
     set of index numbers in the local sequence (ie. elements updated by the same
-    node have consecutive indices).*/
+    rank have consecutive indices).*/
     int* external_local_index = (int*)allocate(ARRAY_ALIGNMENT,
-        MAX_EXTERNAL * sizeof(int));
-    c->external_local_index = external_local_index;
+        num_external * sizeof(int));
 
     int count = local_nrow;
 
@@ -172,27 +323,39 @@ void commPartition(Comm* c, Matrix* A)
     for (int i = 0; i < local_nrow; i++) {
         for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) {
             if (col_ind[j] < 0) {
-                int cur_ind = -col_ind[j] - 1; // FIXME: Offset by 1??
+                // size_t cur_ind = -(col_ind[j] - 1); // FIXME: Offset by 1??
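+                // decode: col_ind[j] holds the negated global column index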
+                int cur_ind = -col_ind[j];
                 col_ind[j] = external_local_index[externals[cur_ind]];
             }
         }
     }
 
-    int new_external_processor[num_external];
-
-    for (int i = 0; i < num_external; i++) {
-        new_external_processor[i] = 0;
-    }
-
-    // setup map from external id to partition
-    for (int i = 0; i < num_external; i++) {
-        new_external_processor[external_local_index[i] - local_nrow] =
-            external_processor[i];
-    }
-
+    free(externals);
+    // int new_external_processor[num_external];
+    //
+    // for (int i = 0; i < num_external; i++) {
+    //     new_external_processor[i] = -1;
+    // }
+    //
+    // // setup map from external id to partition
+    // for (int i = 0; i < num_external; i++) {
+    //     int id = external_local_index[i] - local_nrow;
+    //     new_external_processor[id] = external_processor[i];
+    //     printf("Rank %d of %d: %d new_external_processor[%d] = %d\n",
+    //         rank,
+    //         size,
+    //         i,
+    //         id,
+    //         external_processor[i]);
+    // }
+    // commFinalize(c);
+    // exit(EXIT_SUCCESS);
+
 #ifdef VERBOSE
+    printf("Rank %d of %d: %d externals\n", rank, size, num_external);
+
     for (int i = 0; i < num_external; i++) {
-        printf("Process %d of %d: external process[%d] = %d\n",
+        printf("Rank %d of %d: external[%d] owned by %d\n",
            rank,
            size,
            i,
@@ -217,119 +380,93 @@ void commPartition(Comm* c, Matrix* A)
     int num_recv_neighbors = 0;
     int length = 1;
 
+    // Encode both the number of ranks that need values from this rank and the
+    // total number of values: add one for each new rank and add size for
+    // every additional value.
     for (int i = 0; i < num_external; i++) {
-        if (tmp_neighbors[new_external_processor[i]] == 0) {
+        if (tmp_neighbors[external_processor[i]] == 0) {
             num_recv_neighbors++;
-            tmp_neighbors[new_external_processor[i]] = 1;
+            tmp_neighbors[external_processor[i]] = 1;
         }
-        tmp_neighbors[new_external_processor[i]] += size;
+        tmp_neighbors[external_processor[i]] += size;
     }
 
     // sum over all processors all the tmp_neighbors arrays
-    MPI_Allreduce(tmp_neighbors,
-        tmp_buffer,
+    MPI_Allreduce(MPI_IN_PLACE,
+        tmp_neighbors,
         size,
         MPI_INT,
         MPI_SUM,
         MPI_COMM_WORLD);
 
-    /* decode the combined 'tmp_neighbors' (stored in tmp_buffer) array from all
-     * the processors */
-    int num_send_neighbors = tmp_buffer[rank] % size;
+    /* decode the combined 'tmp_neighbors' array from all ranks */
+    // Number of ranks that receive values from us
+    int num_send_neighbors = tmp_neighbors[rank] % size;
 
-    /* decode 'tmp_buffer[rank] to deduce total number of elements we must send */
-    int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;
+    /* decode 'tmp_neighbors[rank]' to deduce total number of elements we must
+     * send */
+    c->totalSendCount = (tmp_neighbors[rank] - num_send_neighbors) / size;
 
-    /* Check to see if we have enough workspace allocated. This could be
+    /* Check to see if we have enough memory allocated. This could be
     dynamically modified, but let's keep it simple for now...*/
     if (num_send_neighbors > MAX_NUM_MESSAGES) {
         printf("Must increase MAX_NUM_MESSAGES. Must be at least %d\n",
             num_send_neighbors);
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
         exit(EXIT_FAILURE);
     }
 
-    if (total_to_be_sent > MAX_EXTERNAL) {
+    if (c->totalSendCount > MAX_EXTERNAL) {
         printf("Must increase MAX_EXTERNAL. Must be at least %d\n",
-            total_to_be_sent);
+            c->totalSendCount);
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
         exit(EXIT_FAILURE);
     }
 
 #ifdef VERBOSE
-    cout << "Processor " << rank << " of " << size
-         << ": Number of send neighbors = " << num_send_neighbors << endl;
-
-    cout << "Processor " << rank << " of " << size
-         << ": Number of receive neighbors = " << num_recv_neighbors << endl;
-
-    cout << "Processor " << rank << " of " << size
-         << ": Total number of elements to send = " << total_to_be_sent << endl;
-
+    printf("Rank %d of %d: tmp_neighbors = %d\n",
+        rank,
+        size,
+        tmp_neighbors[rank]);
+    printf("Rank %d of %d: Number of send neighbors = %d\n",
+        rank,
+        size,
+        num_send_neighbors);
+    printf("Rank %d of %d: Number of receive neighbors = %d\n",
+        rank,
+        size,
+        num_recv_neighbors);
+    printf("Rank %d of %d: Total number of elements to send = %d\n",
+        rank,
+        size,
+        c->totalSendCount);
    MPI_Barrier(MPI_COMM_WORLD);
 #endif
 
     /* Make a list of the neighbors that will send information to update our
     external elements (in the order that we will receive this information).*/
-    int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
+    int* recv_list = allocate(ARRAY_ALIGNMENT, MAX_NUM_MESSAGES * sizeof(int));
 
-    // FIXME: Create local scope
-    int j = 0;
-    recv_list[j++] = new_external_processor[0];
+    {
+        int j = 0;
+        recv_list[j++] = external_processor[0];
 
-    for (int i = 1; i < num_external; i++) {
-        if (new_external_processor[i - 1] != new_external_processor[i]) {
-            recv_list[j++] = new_external_processor[i];
+        for (int i = 1; i < num_external; i++) {
+            if (external_processor[i - 1] != external_processor[i]) {
+                recv_list[j++] = external_processor[i];
+            }
         }
     }
 
     // Ensure that all the neighbors we expect to receive from also send to us
-    // Send a 0 length message to each of our recv neighbors
     int send_list[num_send_neighbors];
-    for (int i = 0; i < num_send_neighbors; i++) {
-        send_list[i] = 0;
-    }
-
-    // first post receives, these are immediate receives
-    // Do not wait for result to come, will do that at the
-    // wait call below.
-    int MPI_MY_TAG = 99;
-
-    MPI_Request request[MAX_NUM_MESSAGES];
-
-    for (int i = 0; i < num_send_neighbors; i++) {
-        MPI_Irecv(tmp_buffer + i,
-            1,
-            MPI_INT,
-            MPI_ANY_SOURCE,
-            MPI_MY_TAG,
-            MPI_COMM_WORLD,
-            request + i);
-    }
-
-    // send messages
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        MPI_Send(tmp_buffer + i,
-            1,
-            MPI_INT,
-            recv_list[i],
-            MPI_MY_TAG,
-            MPI_COMM_WORLD);
-    }
-
-    // Receive message from each send neighbor to construct 'send_list'.
-    MPI_Status status;
-    for (int i = 0; i < num_send_neighbors; i++) {
-        if (MPI_Wait(request + i, &status)) {
-            printf("MPI_Wait error\n");
-            exit(EXIT_FAILURE);
-        }
-        send_list[i] = status.MPI_SOURCE;
-    }
+    probeNeighbors(send_list, num_send_neighbors, recv_list, num_recv_neighbors);
 
     /* Compare the two lists. In most cases they should be the same.
     // However, if they are not then add new entries to the recv list
     // that are in the send list (but not already in the recv list).
-    WHY!! This ensures that the sendlist is equal to the sendlist
+    FIXME: WHY!! This ensures that the recv_list is equal to the sendlist
     But why is this required?
    -> Just One neighbour list??*/
     for (int j = 0; j < num_send_neighbors; j++) {
         int found = 0;
@@ -343,29 +480,23 @@ void commPartition(Comm* c, Matrix* A)
                 rank,
                 size,
                 num_recv_neighbors,
-                send_list[i]);
+                send_list[j]);
 #endif
             recv_list[num_recv_neighbors] = send_list[j];
-            (num_recv_neighbors)++;
+            num_recv_neighbors++;
         }
     }
 
-    num_send_neighbors = num_recv_neighbors;
-
-    if (num_send_neighbors > MAX_NUM_MESSAGES) {
-        printf("Must increase MAX_EXTERNAL\n");
-        exit(EXIT_FAILURE);
+    // From here on only have one neighbor list for both send and recv
+    c->numNeighbors = num_recv_neighbors;
+
+    for (int i = 0; i < c->numNeighbors; i++) {
+        c->neighbors[i] = recv_list[i];
     }
 
-    // Start filling communication setup
-    // Create 'new_external' which explicitly put the external elements in the
-    // order given by 'external_local_index'
-    c->total_to_be_sent = total_to_be_sent;
-    int* elements_to_send = (int*)allocate(ARRAY_ALIGNMENT,
-        total_to_be_sent * sizeof(int));
-    c->elements_to_send = elements_to_send;
-
-    for (int i = 0; i < total_to_be_sent; i++) {
-        elements_to_send[i] = 0;
+    if (c->numNeighbors > MAX_NUM_MESSAGES) {
+        printf("Must increase MAX_EXTERNAL\n");
+        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
+        exit(EXIT_FAILURE);
     }
 
     // Create 'new_external' which explicitly put the external elements in the
@@ -377,195 +508,100 @@ void commPartition(Comm* c, Matrix* A)
         new_external[external_local_index[i] - local_nrow] = external_index[i];
     }
 
+    free(external_local_index);
+    free(external_index);
+    c->numExternal = num_external;
+
+    buildNeighbors(c, external_processor);
+
     // Send each processor the global index list of the external elements in the
     // order that I will want to receive them when updating my external elements
-    int lengths[num_recv_neighbors];
-    MPI_MY_TAG++;
+    // Build "elementsToSend" list. These are the x elements the current rank
+    // owns that need to be sent to other ranks
+    buildElementsToSend(c, A->startRow, external_processor, new_external);
 
-    // First post receives
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        int partner = recv_list[i];
-        MPI_Irecv(lengths + i,
-            1,
-            MPI_INT,
-            partner,
-            MPI_MY_TAG,
-            MPI_COMM_WORLD,
-            request + i);
-    }
-
-    int* neighbors = c->neighbors;
-    int* recv_length = c->recv_length;
-    int* send_length = c->send_length;
-
-    j = 0;
-
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        int start = j;
-        int newlength = 0;
-
-        // go through list of external elements until updating
-        // processor changes
-        while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
-            newlength++;
-            j++;
-            if (j == num_external) break;
-        }
-
-        recv_length[i] = newlength;
-        neighbors[i] = recv_list[i];
-
-        length = j - start;
-        MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
-    }
-
-    // Complete the receives of the number of externals
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        if (MPI_Wait(request + i, &status)) {
-            printf("MPI_Wait error\n");
-            exit(EXIT_FAILURE);
-        }
-        send_length[i] = lengths[i];
-    }
-
-    // Build "elements_to_send" list. These are the x elements I own
-    // that need to be sent to other processors.
-    MPI_MY_TAG++;
-
-    j = 0;
-
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        MPI_Irecv(elements_to_send + j,
-            send_length[i],
-            MPI_INT,
-            neighbors[i],
-            MPI_MY_TAG,
-            MPI_COMM_WORLD,
-            request + i);
-        j += send_length[i];
-    }
-
-    j = 0;
-
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        int start = j;
-        int newlength = 0;
-
-        // Go through list of external elements
-        // until updating processor changes.
-        // This is redundant, but
-        // saves us from recording this information.
-        while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
-
-            newlength++;
-            j++;
-            if (j == num_external) break;
-        }
-        MPI_Send(new_external + start,
-            j - start,
-            MPI_INT,
-            recv_list[i],
-            MPI_MY_TAG,
-            MPI_COMM_WORLD);
-    }
-
-    // receive from each neighbor the global index list of external elements
-    for (int i = 0; i < num_recv_neighbors; i++) {
-        if (MPI_Wait(request + i, &status)) {
-            printf("MPI_Wait error\n");
-            exit(EXIT_FAILURE);
-        }
-    }
-
-    /// replace global indices by local indices
-    for (int i = 0; i < total_to_be_sent; i++) {
-        elements_to_send[i] -= start_row;
-    }
-
-    // Finish up !!
-    c->num_send_neighbors = num_send_neighbors;
-    A->nc = A->nc + num_external;
-
-    // Used in exchange
-    CG_FLOAT* send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
-        total_to_be_sent * sizeof(CG_FLOAT));
-    c->send_buffer = send_buffer;
+    A->nc = A->nc + num_external;
+    c->send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
+        c->totalSendCount * sizeof(CG_FLOAT));
 
     free(recv_list);
     free(new_external);
 #endif
 }
 
-void commExchange(Comm* c, Matrix* A, double* x)
+void commExchange(Comm* c, Matrix* A, CG_FLOAT* x)
 {
 #ifdef _MPI
-    int num_external = 0;
+    int numNeighbors = c->numNeighbors;
+    int* neighbors = c->neighbors;
+    int* recvCount = c->recvCount;
+    int* sendCount = c->sendCount;
+    CG_FLOAT* sendBuffer = c->send_buffer;
+    int* elementsToSend = c->elementsToSend;
 
-    // Extract Matrix pieces
-
-    int local_nrow = A->nr;
-    int num_neighbors = c->num_send_neighbors;
-    int* recv_length = c->recv_length;
-    int* send_length = c->send_length;
-    int* neighbors = c->neighbors;
-    double* send_buffer = c->send_buffer;
-    int total_to_be_sent = c->total_to_be_sent;
-    int* elements_to_send = c->elements_to_send;
-
-    int rank = c->rank;
-    int size = c->size;
-    MPI_Comm comm = c->comm;
-
-    // first post receives, these are immediate receives
-    // Do not wait for result to come, will do that at the
-    // wait call below.
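+    // Halo exchange: post one receive per neighbor into the ghost region that
+    // starts behind the A->nr owned entries of x, gather owned values into the
+    // send buffer in elementsToSend order, then send one block per neighbor.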
     int MPI_MY_TAG = 99;
-
-    MPI_Request request[num_neighbors];
+    MPI_Request request[numNeighbors];
 
     // Externals are at end of locals
-    double* x_external = (double*)x + local_nrow;
+    CG_FLOAT* externals = x + A->nr;
 
-    // Post receives first
-    for (int i = 0; i < num_neighbors; i++) {
-        int n_recv = recv_length[i];
-        MPI_Irecv(x_external,
-            n_recv,
-            MPI_DOUBLE,
+    // Post receives
+    for (int i = 0; i < numNeighbors; i++) {
+        int count = recvCount[i];
+
+        MPI_Irecv(externals,
+            count,
+            MPI_FLOAT_TYPE,
             neighbors[i],
             MPI_MY_TAG,
             MPI_COMM_WORLD,
             request + i);
-        x_external += n_recv;
+
+        externals += count;
     }
 
-    // Fill up send buffer
-    for (int i = 0; i < total_to_be_sent; i++) {
-        send_buffer[i] = x[elements_to_send[i]];
+    // Copy values for all ranks into send buffer
+    for (int i = 0; i < c->totalSendCount; i++) {
+        sendBuffer[i] = x[elementsToSend[i]];
     }
 
     // Send to each neighbor
-    for (int i = 0; i < num_neighbors; i++) {
-        int n_send = send_length[i];
-        MPI_Send(send_buffer,
-            n_send,
-            MPI_DOUBLE,
+    for (int i = 0; i < numNeighbors; i++) {
+        int count = sendCount[i];
+
+        MPI_Send(sendBuffer,
+            count,
+            MPI_FLOAT_TYPE,
             neighbors[i],
             MPI_MY_TAG,
             MPI_COMM_WORLD);
-        send_buffer += n_send;
+
+        sendBuffer += count;
     }
 
-    // Complete the reads issued above
+    // Complete the receives issued above
     MPI_Status status;
-    for (int i = 0; i < num_neighbors; i++) {
+    for (int i = 0; i < numNeighbors; i++) {
         if (MPI_Wait(request + i, &status)) {
             printf("MPI_Wait error\n");
+            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
             exit(EXIT_FAILURE);
         }
     }
 #endif
 }
 
+void commReduction(double* v, int op)
+{
+#ifdef _MPI
+    if (op == MAX) {
+        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+    } else if (op == SUM) {
+        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+    }
+#endif
+}
+
 void commPrintConfig(Comm* c)
 {
 #ifdef _MPI
@@ -577,6 +613,21 @@ void commPrintConfig(Comm* c)
 
     for (int i = 0; i < c->size; i++) {
         if (i == c->rank) {
+            printf("Rank %d has %d neighbors with %d externals:\n",
+                c->rank,
+                c->numNeighbors,
+                c->numExternal);
+            for (int j = 0; j < c->numNeighbors; j++) {
+                printf("\t%d: receive %d send %d\n",
+                    c->neighbors[j],
+                    c->recvCount[j],
+                    c->sendCount[j]);
+            }
+            printf("\tSend %d elements: [", c->totalSendCount);
+            for (int j = 0; j < c->totalSendCount; j++) {
+                printf("%d, ", c->elementsToSend[j]);
+            }
+            printf("]\n");
             fflush(stdout);
         }
         MPI_Barrier(MPI_COMM_WORLD);
@@ -594,14 +645,14 @@ void commMatrixDump(Comm* c, Matrix* m)
     CG_FLOAT* val = m->val;
 
     if (commIsMaster(c)) {
-        printf("Matrix: %lld total non zeroes, total number of rows %lld\n",
+        printf("Matrix: %d total non zeroes, total number of rows %d\n",
             m->totalNnz,
             m->totalNr);
     }
 
     for (int i = 0; i < size; i++) {
         if (i == rank) {
-            printf("Rank %d: %lld non zeroes, number of rows %lld\n",
+            printf("Rank %d: %d non zeroes, number of rows %d\n",
                 rank,
                 m->nnz,
                 numRows);
@@ -609,9 +660,9 @@ void commMatrixDump(Comm* c, Matrix* m)
             for (int rowID = 0; rowID < numRows; rowID++) {
                 printf("Row [%d]: ", rowID);
 
-                for (size_t rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
+                for (int rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
                      rowEntry++) {
-                    printf("[%lld]:%.2f ", colInd[rowEntry], val[rowEntry]);
+                    printf("[%d]:%.2f ", colInd[rowEntry], val[rowEntry]);
                 }
 
                 printf("\n");
diff --git a/src/comm.h b/src/comm.h
index 7821ff9..1bff459 100644
--- a/src/comm.h
+++ b/src/comm.h
@@ -4,6 +4,7 @@
  * license that can be found in the LICENSE file.
  */
 #ifndef __COMM_H_
 #define __COMM_H_
+#include "util.h"
 #if defined(_MPI)
 #include <mpi.h>
 #endif
@@ -20,18 +21,14 @@ typedef struct {
     int rank;
     int size;
 #if defined(_MPI)
-    MPI_Comm comm;
-
-    int num_external;
-    int num_send_neighbors;
-    int* external_index;
-    int* external_local_index;
-    int total_to_be_sent;
-    int* elements_to_send;
+    int numNeighbors;
+    int numExternal;
+    int totalSendCount;
+    int* elementsToSend;
     int neighbors[MAX_NUM_NEIGHBOURS];
-    int recv_length[MAX_NUM_NEIGHBOURS];
-    int send_length[MAX_NUM_NEIGHBOURS];
-    double* send_buffer;
+    int recvCount[MAX_NUM_NEIGHBOURS];
+    int sendCount[MAX_NUM_NEIGHBOURS];
+    CG_FLOAT* send_buffer;
 #endif
 } Comm;
diff --git a/src/main.c b/src/main.c
index 31afc64..d2c63eb 100644
--- a/src/main.c
+++ b/src/main.c
@@ -55,9 +55,9 @@ int main(int argc, char** argv)
     CG_FLOAT eps = (CG_FLOAT)param.eps;
     int itermax = param.itermax;
 
     initSolver(&s, &comm, &param);
-    commMatrixDump(&comm, &s.A);
-    commFinalize(&comm);
-    exit(EXIT_SUCCESS);
+    // commMatrixDump(&comm, &s.A);
+    commPartition(&comm, &s.A);
+    commPrintConfig(&comm);
 
     CG_UINT nrow = s.A.nr;
     CG_UINT ncol = s.A.nc;
diff --git a/src/solver.c b/src/solver.c
index c071190..81e1fed 100644
--- a/src/solver.c
+++ b/src/solver.c
@@ -109,7 +109,7 @@ static void matrixGenerate(
 #ifdef VERBOSE
     printf("Process %d of %d has %d rows\n", rank, size, local_nrow);
     printf("Global rows %d through %d\n", start_row, stop_row);
-    printf("%d nonzeros\n", start_row, stop_row);
+    printf("%d nonzeros\n", local_nnz);
 #endif /* ifdef VERBOSE */
 
     s->A.startRow = start_row;
@@ -139,11 +139,11 @@ void spMVM(Matrix* m, const CG_FLOAT* restrict x, CG_FLOAT* restrict y)
     CG_UINT* colInd = m->colInd;
     CG_FLOAT* val = m->val;
 
-    for (size_t rowID = 0; rowID < numRows; rowID++) {
+    for (int rowID = 0; rowID < numRows; rowID++) {
         CG_FLOAT tmp = y[rowID];
 
         // loop over all elements in row
-        for (size_t rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
+        for (int rowEntry = rowPtr[rowID]; rowEntry < rowPtr[rowID + 1];
             rowEntry++) {
             tmp += val[rowEntry] * x[colInd[rowEntry]];
         }
@@ -160,15 +160,15 @@ void waxpby(const CG_UINT n,
     CG_FLOAT* const w)
 {
     if (alpha == 1.0) {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             w[i] = x[i] + beta * y[i];
         }
     } else if (beta == 1.0) {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             w[i] = alpha * x[i] + y[i];
         }
     } else {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             w[i] = alpha * x[i] + beta * y[i];
         }
     }
@@ -182,11 +182,11 @@ void ddot(const CG_UINT n,
     CG_FLOAT sum = 0.0;
 
     if (y == x) {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             sum += x[i] * x[i];
         }
     } else {
-        for (size_t i = 0; i < n; i++) {
+        for (int i = 0; i < n; i++) {
             sum += x[i] * y[i];
         }
     }
diff --git a/src/util.h b/src/util.h
index 1bc160d..8bc38ca 100644
--- a/src/util.h
+++ b/src/util.h
@@ -23,13 +23,16 @@
 #define MAXLINE 4096
 #endif
 
-#define CG_UINT unsigned long long int
+// #define CG_UINT unsigned long long int
+#define CG_UINT int
 
 #if PRECISION == 1
 #define CG_FLOAT float
+#define MPI_FLOAT_TYPE MPI_FLOAT
 #define PRECISION_STRING "single"
 #else
 #define CG_FLOAT double
+#define MPI_FLOAT_TYPE MPI_DOUBLE
 #define PRECISION_STRING "double"
 #endif
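+// NOTE: MPI_FLOAT_TYPE must stay in sync with CG_FLOAT so that halo exchanges
+// in commExchange transfer values with the matching MPI datatype.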