/* Copyright (C) NHR@FAU, University Erlangen-Nuremberg.
 * All rights reserved. This file is part of nusif-solver.
 * Use of this source code is governed by a MIT style
 * license that can be found in the LICENSE file. */
#include "util.h"
#include <stdio.h>
#include <stdlib.h>
#ifdef _MPI
#include <mpi.h>
#endif

#include "allocate.h"
#include "comm.h"

// subroutines local to this module
int sizeOfRank(int rank, int size, int N)
{
    return N / size + ((N % size > rank) ? 1 : 0);
}

void commReduction(double* v, int op)
{
#ifdef _MPI
    if (op == MAX) {
        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
    } else if (op == SUM) {
        MPI_Allreduce(MPI_IN_PLACE, v, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    }
#endif
}

void commPartition(Comm* c, Matrix* A)
{
#ifdef _MPI
    int rank      = c->rank;
    int size      = c->size;
    MPI_Comm comm = c->comm;

    // Extract Matrix pieces
    int start_row       = A->startRow;
    int stop_row        = A->stopRow;
    int total_nrow      = A->totalNr;
    long long total_nnz = A->totalNnz;
    int local_nrow      = A->nr;
    int local_nnz       = A->nnz;
    int* row_ptr        = (int*)A->rowPtr;
    int* col_ind        = (int*)A->colInd;

    // int* nnz_in_row             = A->nnz_in_row;
    // double** ptr_to_vals_in_row = A->ptr_to_vals_in_row;
    // int** ptr_to_inds_in_row    = A->ptr_to_inds_in_row;

    // We need to convert the index values for the rows on this processor
    // to a local index space. We need to:
    // - Determine if each index refers to a local value or an external value
    // - If local, subtract start_row from the index value to get the local index
    // - If external, find out if it has already been accounted for:
    //   - if so, do nothing,
    //   - otherwise add it to the list of external indices and
    //     find out which processor owns the value.
    // - Set up communication for the sparse MV operation.

    // Scan the indices and transform to local
    int* externals   = (int*)allocate(ARRAY_ALIGNMENT, A->totalNr * sizeof(int));
    int num_external = 1;

    int* external_index = (int*)allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));
    c->external_index   = external_index;

    for (int i = 0; i < A->totalNr; i++) {
        externals[i] = -1;
    }

    for (int i = 0; i < local_nrow; i++) {
        for (int j = row_ptr[i]; j < row_ptr[i + 1]; j++) {
            int cur_ind = A->colInd[j];

#ifdef VERBOSE
            printf("Process %d of %d getting index %d in local row %d\n",
                rank,
                size,
                cur_ind,
                i);
#endif

            // shift local rows to the start
            if (start_row <= cur_ind && cur_ind <= stop_row) {
                col_ind[j] -= start_row;
            } else {
                // Must find out if we have already set up this point
                if (externals[cur_ind] == -1) {
                    externals[cur_ind] = num_external++;

                    if (num_external <= MAX_EXTERNAL) {
                        external_index[num_external - 1] = cur_ind;
                        // Mark index as external by negating it
                        col_ind[j] = -col_ind[j];
                    } else {
                        printf("Must increase MAX_EXTERNAL\n");
                        exit(EXIT_FAILURE);
                    }
                } else {
                    // Mark index as external by negating it
                    col_ind[j] = -col_ind[j];
                }
            }
        }
    }

    /**************************************************************************
     * Go through the list of externals to find out which processors must be
     * accessed.
     **************************************************************************/
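    /* Illustrative example (added note, numbers are assumed): with size = 3
     * ranks owning global rows [0,99], [100,199] and [200,299], the
     * global_index_offsets computed below become {0, 100, 200}. An external
     * index 150 is then assigned to rank 1, because scanning the offsets from
     * the highest rank downwards, 100 is the first offset that is <= 150. */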
    c->num_external = num_external;
    int tmp_buffer[size];
    int global_index_offsets[size];

    for (int i = 0; i < size; i++) {
        tmp_buffer[i] = 0;
    }

    tmp_buffer[rank] = start_row;

    MPI_Allreduce(tmp_buffer,
        global_index_offsets,
        size,
        MPI_INT,
        MPI_SUM,
        MPI_COMM_WORLD);

    // Go through the list of externals and find the processor that owns each
    int external_processor[num_external];

    for (int i = 0; i < num_external; i++) {
        int cur_ind = external_index[i];

        for (int j = size - 1; j >= 0; j--) {
            if (global_index_offsets[j] <= cur_ind) {
                external_processor[i] = j;
                break;
            }
        }
    }

    /* Go through the external elements. For each newly encountered external
     * point assign it the next index in the local sequence. Then look for
     * other external elements that are updated by the same node and assign
     * them the next set of index numbers in the local sequence (i.e. elements
     * updated by the same node have consecutive indices). */
    int* external_local_index = (int*)allocate(ARRAY_ALIGNMENT,
        MAX_EXTERNAL * sizeof(int));
    c->external_local_index   = external_local_index;

    int count = local_nrow;

    for (int i = 0; i < num_external; i++) {
        external_local_index[i] = -1;
    }

    for (int i = 0; i < num_external; i++) {
        if (external_local_index[i] == -1) {
            external_local_index[i] = count++;

            for (int j = i + 1; j < num_external; j++) {
                if (external_processor[j] == external_processor[i]) {
                    external_local_index[j] = count++;
                }
            }
        }
    }

    // map all external ids to the new local index
    CG_UINT* rowPtr = A->rowPtr;

    for (int i = 0; i < local_nrow; i++) {
        for (int j = rowPtr[i]; j < rowPtr[i + 1]; j++) {
            if (col_ind[j] < 0) {
                int cur_ind = -col_ind[j] - 1; // FIXME: Offset by 1??
                col_ind[j]  = external_local_index[externals[cur_ind]];
            }
        }
    }

    int new_external_processor[num_external];

    for (int i = 0; i < num_external; i++) {
        new_external_processor[i] = 0;
    }

    // setup map from external id to partition
    for (int i = 0; i < num_external; i++) {
        new_external_processor[external_local_index[i] - local_nrow] =
            external_processor[i];
    }

#ifdef VERBOSE
    for (int i = 0; i < num_external; i++) {
        printf("Process %d of %d: external process[%d] = %d\n",
            rank,
            size,
            i,
            external_processor[i]);
    }
#endif

    /* Count the number of neighbors from which we receive information to
     * update our external elements. Additionally, fill the array
     * tmp_neighbors in the following way:
     *   tmp_neighbors[i] = 0  ==>  No external elements are updated by
     *                              processor i.
     *   tmp_neighbors[i] = x  ==>  (x-1)/size elements are updated from
     *                              processor i. */
    int tmp_neighbors[size];

    for (int i = 0; i < size; i++) {
        tmp_neighbors[i] = 0;
    }

    int num_recv_neighbors = 0;
    int length             = 1;

    for (int i = 0; i < num_external; i++) {
        if (tmp_neighbors[new_external_processor[i]] == 0) {
            num_recv_neighbors++;
            tmp_neighbors[new_external_processor[i]] = 1;
        }
        tmp_neighbors[new_external_processor[i]] += size;
    }

    // sum over all processors all the tmp_neighbors arrays
    MPI_Allreduce(tmp_neighbors, tmp_buffer, size, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    /* decode the combined 'tmp_neighbors' (stored in tmp_buffer) array from
     * all the processors */
    int num_send_neighbors = tmp_buffer[rank] % size;

    /* decode 'tmp_buffer[rank]' to deduce the total number of elements we
     * must send */
    int total_to_be_sent = (tmp_buffer[rank] - num_send_neighbors) / size;
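    /* Illustrative example of the encoding above (added note, numbers are
     * assumed): with size = 4 ranks, a rank that needs 3 external elements
     * owned by processor 2 and 1 element owned by processor 0 sets
     * tmp_neighbors[2] = 1 + 3*4 = 13 and tmp_neighbors[0] = 1 + 1*4 = 5
     * before the Allreduce. After summing over all ranks, tmp_buffer[rank]
     * combines everything addressed to this rank, so tmp_buffer[rank] % size
     * gives the number of ranks that need data from us (num_send_neighbors)
     * and (tmp_buffer[rank] - num_send_neighbors) / size gives the total
     * number of elements we must send. */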
    /* Check to see if we have enough workspace allocated. This could be
     * dynamically modified, but let's keep it simple for now... */
    if (num_send_neighbors > MAX_NUM_MESSAGES) {
        printf("Must increase MAX_NUM_MESSAGES. Must be at least %d\n",
            num_send_neighbors);
        exit(EXIT_FAILURE);
    }

    if (total_to_be_sent > MAX_EXTERNAL) {
        printf("Must increase MAX_EXTERNAL. Must be at least %d\n", total_to_be_sent);
        exit(EXIT_FAILURE);
    }

#ifdef VERBOSE
    printf("Processor %d of %d: Number of send neighbors = %d\n",
        rank,
        size,
        num_send_neighbors);
    printf("Processor %d of %d: Number of receive neighbors = %d\n",
        rank,
        size,
        num_recv_neighbors);
    printf("Processor %d of %d: Total number of elements to send = %d\n",
        rank,
        size,
        total_to_be_sent);
    MPI_Barrier(MPI_COMM_WORLD);
#endif

    /* Make a list of the neighbors that will send information to update our
     * external elements (in the order that we will receive this
     * information). */
    int* recv_list = (int*)allocate(ARRAY_ALIGNMENT, MAX_EXTERNAL * sizeof(int));

    // FIXME: Create local scope
    int j          = 0;
    recv_list[j++] = new_external_processor[0];

    for (int i = 1; i < num_external; i++) {
        if (new_external_processor[i - 1] != new_external_processor[i]) {
            recv_list[j++] = new_external_processor[i];
        }
    }

    // Ensure that all the neighbors we expect to receive from also send to us.
    // Send a 0-length message to each of our recv neighbors.
    int send_list[num_send_neighbors];

    for (int i = 0; i < num_send_neighbors; i++) {
        send_list[i] = 0;
    }

    // First post receives. These are immediate receives; do not wait for the
    // result to arrive, that happens at the wait call below.
    int MPI_MY_TAG = 99;
    MPI_Request request[MAX_NUM_MESSAGES];

    for (int i = 0; i < num_send_neighbors; i++) {
        MPI_Irecv(tmp_buffer + i,
            1,
            MPI_INT,
            MPI_ANY_SOURCE,
            MPI_MY_TAG,
            MPI_COMM_WORLD,
            request + i);
    }

    // send messages
    for (int i = 0; i < num_recv_neighbors; i++) {
        MPI_Send(tmp_buffer + i, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
    }

    // Receive a message from each send neighbor to construct 'send_list'.
    MPI_Status status;

    for (int i = 0; i < num_send_neighbors; i++) {
        if (MPI_Wait(request + i, &status)) {
            printf("MPI_Wait error\n");
            exit(EXIT_FAILURE);
        }
        send_list[i] = status.MPI_SOURCE;
    }

    /* Compare the two lists. In most cases they should be the same. However,
     * if they are not, then add new entries to the recv list that are in the
     * send list (but not already in the recv list).
     * WHY: This ensures that the send list is equal to the recv list.
     * But why is this required? -> Just one neighbour list?? */
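    /* Presumed rationale (added note): with a non-symmetric sparsity pattern a
     * rank can need values from a neighbor that needs nothing back, so the
     * handshake above may discover send neighbors that are not yet receive
     * neighbors. Merging the lists below makes the neighbour relation
     * symmetric (num_send_neighbors == num_recv_neighbors afterwards), so a
     * single 'neighbors' array can drive both the sends and the receives in
     * commExchange(). */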
    for (int j = 0; j < num_send_neighbors; j++) {
        int found = 0;

        for (int i = 0; i < num_recv_neighbors; i++) {
            if (recv_list[i] == send_list[j]) found = 1;
        }

        if (found == 0) {
#ifdef VERBOSE
            printf("Process %d of %d: recv_list[%d] = %d\n",
                rank,
                size,
                num_recv_neighbors,
                send_list[j]);
#endif
            recv_list[num_recv_neighbors] = send_list[j];
            num_recv_neighbors++;
        }
    }

    num_send_neighbors = num_recv_neighbors;

    if (num_send_neighbors > MAX_NUM_MESSAGES) {
        printf("Must increase MAX_NUM_MESSAGES\n");
        exit(EXIT_FAILURE);
    }

    // Start filling communication setup
    c->total_to_be_sent   = total_to_be_sent;
    int* elements_to_send = (int*)allocate(ARRAY_ALIGNMENT,
        total_to_be_sent * sizeof(int));
    c->elements_to_send   = elements_to_send;

    for (int i = 0; i < total_to_be_sent; i++) {
        elements_to_send[i] = 0;
    }

    // Create 'new_external' which explicitly puts the external elements in the
    // order given by 'external_local_index'
    int* new_external = (int*)allocate(ARRAY_ALIGNMENT, num_external * sizeof(int));

    for (int i = 0; i < num_external; i++) {
        new_external[external_local_index[i] - local_nrow] = external_index[i];
    }

    // Send each processor the global index list of the external elements in
    // the order that I will want to receive them when updating my external
    // elements
    int lengths[num_recv_neighbors];
    MPI_MY_TAG++;

    // First post receives
    for (int i = 0; i < num_recv_neighbors; i++) {
        int partner = recv_list[i];
        MPI_Irecv(lengths + i, 1, MPI_INT, partner, MPI_MY_TAG, MPI_COMM_WORLD, request + i);
    }

    int* neighbors   = c->neighbors;
    int* recv_length = c->recv_length;
    int* send_length = c->send_length;

    j = 0;

    for (int i = 0; i < num_recv_neighbors; i++) {
        int start     = j;
        int newlength = 0;

        // go through the list of external elements until the updating
        // processor changes
        while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
            newlength++;
            j++;
            if (j == num_external) break;
        }

        recv_length[i] = newlength;
        neighbors[i]   = recv_list[i];

        length = j - start;
        MPI_Send(&length, 1, MPI_INT, recv_list[i], MPI_MY_TAG, MPI_COMM_WORLD);
    }

    // Complete the receives of the number of externals
    for (int i = 0; i < num_recv_neighbors; i++) {
        if (MPI_Wait(request + i, &status)) {
            printf("MPI_Wait error\n");
            exit(EXIT_FAILURE);
        }
        send_length[i] = lengths[i];
    }

    // Build the 'elements_to_send' list. These are the entries of x that I own
    // and that need to be sent to other processors.
    MPI_MY_TAG++;

    j = 0;

    for (int i = 0; i < num_recv_neighbors; i++) {
        MPI_Irecv(elements_to_send + j,
            send_length[i],
            MPI_INT,
            neighbors[i],
            MPI_MY_TAG,
            MPI_COMM_WORLD,
            request + i);
        j += send_length[i];
    }

    j = 0;

    for (int i = 0; i < num_recv_neighbors; i++) {
        int start     = j;
        int newlength = 0;

        // Go through the list of external elements until the updating
        // processor changes. This is redundant, but saves us from recording
        // this information.
        while ((j < num_external) && (new_external_processor[j] == recv_list[i])) {
            newlength++;
            j++;
            if (j == num_external) break;
        }

        MPI_Send(new_external + start,
            j - start,
            MPI_INT,
            recv_list[i],
            MPI_MY_TAG,
            MPI_COMM_WORLD);
    }

    // receive from each neighbor the global index list of external elements
    for (int i = 0; i < num_recv_neighbors; i++) {
        if (MPI_Wait(request + i, &status)) {
            printf("MPI_Wait error\n");
            exit(EXIT_FAILURE);
        }
    }

    // replace global indices by local indices
    for (int i = 0; i < total_to_be_sent; i++) {
        elements_to_send[i] -= start_row;
    }
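    /* Illustrative summary (added note, assumed two-rank example): after this
     * setup, rank 0 might end up with neighbors = {1}, recv_length = {2},
     * send_length = {2} and elements_to_send = {5, 9} (local row indices). In
     * commExchange() the owned values x[5] and x[9] are packed into
     * send_buffer and sent to rank 1, while the two values received from
     * rank 1 land directly behind the locals at x[local_nrow] and
     * x[local_nrow + 1]. */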
    // Finish up !!
    c->num_send_neighbors = num_send_neighbors;
    A->nc                 = A->nc + num_external;

    // Used in exchange
    CG_FLOAT* send_buffer = (CG_FLOAT*)allocate(ARRAY_ALIGNMENT,
        total_to_be_sent * sizeof(CG_FLOAT));
    c->send_buffer        = send_buffer;

    free(recv_list);
    free(new_external);
#endif
}

void commExchange(Comm* c, Matrix* A, double* x)
{
#ifdef _MPI
    int num_external = 0;

    // Extract Matrix pieces
    int local_nrow        = A->nr;
    int num_neighbors     = c->num_send_neighbors;
    int* recv_length      = c->recv_length;
    int* send_length      = c->send_length;
    int* neighbors        = c->neighbors;
    double* send_buffer   = c->send_buffer;
    int total_to_be_sent  = c->total_to_be_sent;
    int* elements_to_send = c->elements_to_send;

    int rank      = c->rank;
    int size      = c->size;
    MPI_Comm comm = c->comm;

    // First post receives. These are immediate receives; do not wait for the
    // result to arrive, that happens at the wait call below.
    int MPI_MY_TAG = 99;
    MPI_Request request[num_neighbors];

    // Externals are at the end of the locals
    double* x_external = (double*)x + local_nrow;

    // Post receives first
    for (int i = 0; i < num_neighbors; i++) {
        int n_recv = recv_length[i];
        MPI_Irecv(x_external,
            n_recv,
            MPI_DOUBLE,
            neighbors[i],
            MPI_MY_TAG,
            MPI_COMM_WORLD,
            request + i);
        x_external += n_recv;
    }

    // Fill up send buffer
    for (int i = 0; i < total_to_be_sent; i++) {
        send_buffer[i] = x[elements_to_send[i]];
    }

    // Send to each neighbor
    for (int i = 0; i < num_neighbors; i++) {
        int n_send = send_length[i];
        MPI_Send(send_buffer, n_send, MPI_DOUBLE, neighbors[i], MPI_MY_TAG, MPI_COMM_WORLD);
        send_buffer += n_send;
    }

    // Complete the receives issued above
    MPI_Status status;

    for (int i = 0; i < num_neighbors; i++) {
        if (MPI_Wait(request + i, &status)) {
            printf("MPI_Wait error\n");
            exit(EXIT_FAILURE);
        }
    }
#endif
}

void commPrintConfig(Comm* c)
{
#ifdef _MPI
    fflush(stdout);
    MPI_Barrier(MPI_COMM_WORLD);

    if (commIsMaster(c)) {
        printf("Communication setup:\n");
    }

    for (int i = 0; i < c->size; i++) {
        if (i == c->rank) {
            fflush(stdout);
        }
        MPI_Barrier(MPI_COMM_WORLD);
    }
#endif
}

void commInit(Comm* c, int argc, char** argv)
{
#ifdef _MPI
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &(c->rank));
    MPI_Comm_size(MPI_COMM_WORLD, &(c->size));
#else
    c->rank = 0;
    c->size = 1;
#endif
}

void commFinalize(Comm* c)
{
#ifdef _MPI
    MPI_Finalize();
#endif
}