diff --git a/.gitignore b/.gitignore index fbdfa9d3..87400105 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ heffte/ docs_doxygen/ docs_sphinx/ tutorial/getting_started/Example0/build_* -tutorial/getting_started/Example0/install* \ No newline at end of file +tutorial/getting_started/Example0/install* +examples/mesh_decomp/lib/* \ No newline at end of file diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 372ad21c..0a548973 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) project (matarbenchmark) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index affcd031..4c379334 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -156,6 +156,9 @@ if (KOKKOS) if (MPI) include_directories(laplaceMPI) add_subdirectory(laplaceMPI) + + include_directories(mesh_decomp) + add_subdirectory(mesh_decomp) endif() endif() @@ -191,11 +194,12 @@ add_subdirectory(sparsetests) include_directories(test_rocm) add_subdirectory(test_rocm) -#include_directories(phaseField/srcKokkosVerbose) -#add_subdirectory(phaseField/srcKokkosVerbose) -#include_directories(phaseField/srcMacros) -#add_subdirectory(phaseField/srcMacros) +# include_directories(phaseField/srcKokkosVerbose) +# add_subdirectory(phaseField/srcKokkosVerbose) + +# include_directories(phaseField/srcMacros) +# add_subdirectory(phaseField/srcMacros) -#include_directories(phaseFieldMPI) -#add_subdirectory(phaseFieldMPI) +# include_directories(phaseFieldMPI) +# add_subdirectory(phaseFieldMPI) diff --git a/examples/gArrayofgArrays/CMakeLists.txt b/examples/gArrayofgArrays/CMakeLists.txt index 33a5fa97..e90dd1da 100644 --- a/examples/gArrayofgArrays/CMakeLists.txt +++ b/examples/gArrayofgArrays/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) find_package(Matar REQUIRED) diff --git 
a/examples/halfspace_cooling/CMakeLists.txt b/examples/halfspace_cooling/CMakeLists.txt index dbcaa6f9..91bffb75 100644 --- a/examples/halfspace_cooling/CMakeLists.txt +++ b/examples/halfspace_cooling/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) find_package(Matar REQUIRED) diff --git a/examples/laplace/CMakeLists.txt b/examples/laplace/CMakeLists.txt index acbd4a1f..b3122cd0 100644 --- a/examples/laplace/CMakeLists.txt +++ b/examples/laplace/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) find_package(Matar REQUIRED) diff --git a/examples/laplaceMPI/CMakeLists.txt b/examples/laplaceMPI/CMakeLists.txt index 5b114927..d722fac9 100644 --- a/examples/laplaceMPI/CMakeLists.txt +++ b/examples/laplaceMPI/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) if (KOKKOS) #find_package(Kokkos REQUIRED) #new diff --git a/examples/mesh_decomp/CMakeLists.txt b/examples/mesh_decomp/CMakeLists.txt new file mode 100644 index 00000000..6c8901da --- /dev/null +++ b/examples/mesh_decomp/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.5) + +# Find MPI +find_package(MPI REQUIRED) +add_definitions(-DHAVE_MPI=1) + +find_package(Matar REQUIRED) + +execute_process( + COMMAND ${CMAKE_CURRENT_LIST_DIR}/install_ptscotch.sh + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + RESULT_VARIABLE INSTALL_PTSCOTCH_RESULT +) + +if(NOT INSTALL_PTSCOTCH_RESULT EQUAL 0) + message(FATAL_ERROR "Failed to install PT-Scotch by running install_ptscotch.sh") +endif() + + +if (KOKKOS) + #find_package(Kokkos REQUIRED) #new + + add_executable(mesh_decomp mesh_decomp.cpp) + + add_definitions(-DHAVE_KOKKOS=1) + + # Add include directories for MPI and Scotch/PT-Scotch + target_include_directories(mesh_decomp PRIVATE ${MPI_CXX_INCLUDE_PATH} ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/src/include) + + # Link libraries - order 
matters! libptscotch depends on libscotch + # Use -Wl,--whole-archive to ensure all symbols are included from static libraries + # Note: Only link libptscotcherr.a (not libscotcherr.a) to avoid multiple definitions + target_link_libraries(mesh_decomp ${LINKING_LIBRARIES} MPI::MPI_CXX + -Wl,--whole-archive + ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/lib/libscotch.a + -Wl,--no-whole-archive + -Wl,--whole-archive + ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/lib/libptscotcherr.a + ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/lib/libptscotch.a + -Wl,--no-whole-archive + -lz # zlib for gzip compression + -lbz2 # bzip2 library + -llzma # xz compression library + ) +endif() diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h new file mode 100644 index 00000000..24c75d46 --- /dev/null +++ b/examples/mesh_decomp/decomp_utils.h @@ -0,0 +1,2440 @@ +#ifndef DECOMP_UTILS_H +#define DECOMP_UTILS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "mesh.h" +#include "state.h" +#include "mesh_io.h" +#include "communication_plan.h" + + +// Include Scotch headers +#include "scotch.h" +#include "ptscotch.h" + +/** + * @brief Partitions the input mesh into a naive element-based decomposition across MPI ranks. + * + * This function splits the input mesh (and its associated node information) evenly among the given number of MPI ranks. + * It assigns contiguous blocks of elements (and the corresponding nodes and nodal data) to each rank. + * + * The function constructs: + * - The sub-mesh (naive_mesh) and its nodes (naive_node) for the local rank. + * - Maps and vectors indicating elements and nodes present on each rank. + * - Auxiliary arrays (elems_in_elem_on_rank, num_elems_in_elem_per_rank) for local element connectivity and neighbor look-ups. + * + * The decomposition is "naive" in that it uses a simple contiguous block assignment, without regard to mesh topology or quality of partitioning. 
+ * This function is generally used as the preliminary step before repartitioning with tools like PT-Scotch or for algorithm prototyping. + * + * @param initial_mesh[in] The input mesh containing all elements/nodes on rank 0. + * @param initial_node[in] The nodal data for the input mesh on rank 0. + * @param naive_mesh[out] The mesh on this rank after naive partitioning. + * @param naive_node[out] The nodal data on this rank after naive partitioning. + * @param elems_in_elem_on_rank[out] Vector of element-to-element connectivity for this rank's local mesh. + * @param num_elems_in_elem_per_rank[out] Vector of counts for element neighbors for each local element. + * @param world_size[in] Number of MPI ranks (world size). + * @param rank[in] This MPI rank's id. + */ + +void naive_partition_mesh( + Mesh_t& initial_mesh, + node_t& initial_node, + Mesh_t& naive_mesh, + node_t& naive_node, + CArrayDual& elems_in_elem_on_rank, + CArrayDual& num_elems_in_elem_per_rank, + int world_size, + int rank) +{ + + bool print_info = false; + + int num_elements_on_rank = 0; + int num_nodes_on_rank = 0; + int num_nodes_per_elem = 0; + int num_dim = initial_mesh.num_dims; + + + // Compute the number of elements to send to each rank and num_nodes_per_elem + std::vector elems_per_rank(world_size); // number of elements to send to each rank size(world_size) + if (rank == 0) { + + num_nodes_per_elem = initial_mesh.num_nodes_in_elem; + + // Compute elements to send to each rank; handle remainders for non-even distribution + std::fill(elems_per_rank.begin(), elems_per_rank.end(), initial_mesh.num_elems / world_size); + int remainder = initial_mesh.num_elems % world_size; + for (int i = 0; i < remainder; i++) { + elems_per_rank[i] += 1; + } + } + + // Broadcasts the value of num_nodes_per_elem from the root rank (0) to all other ranks in MPI_COMM_WORLD. + // After this call, all ranks will have the same value for num_nodes_per_elem. 
+ MPI_Bcast(&num_nodes_per_elem, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + // ******************************************************** + // Scatter the number of elements to each rank + // ******************************************************** + // All ranks participate in the scatter operation + // MPI_Scatter signature: + // MPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + // void *recvbuf, int recvcount, MPI_Datatype recvtype, + // int root, MPI_Comm comm) + + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Scatter(elems_per_rank.data(), 1, MPI_INT, + &num_elements_on_rank, 1, MPI_INT, + 0, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + + // Vector of element to send to each rank using a naive partitioning (0-m, m-n, n-o, etc.) + std::vector elements_on_rank(num_elements_on_rank); + + + // ******************************************************** + // Scatter the actual element global ids to each rank + // ******************************************************** + + // create a 2D vector of elements to send to each rank + std::vector> elements_to_send(world_size); + if (rank == 0) { + + // Populate the elements_to_send array by finding all elements in the elements_per_rank array and adding them to the elements_to_send array + int elem_gid = 0; + for (int rank = 0; rank < world_size; rank++) { + for (int j = 0; j < elems_per_rank[rank]; j++) { + elements_to_send[rank].push_back(elem_gid); + elem_gid++; + } + } + + // Prepare data for MPI_Scatterv (scatter with variable counts) + // Flatten the 2D elements_to_send into a 1D array + std::vector all_elements; // array of all elements to be sent to each rank + std::vector sendcounts(world_size); // array of the number of elements to send to each rank + std::vector displs(world_size); // array of the displacement for each rank in the flattened array + + int displacement = 0; // displacement is the starting index of the elements for the current rank in the flattened array + 
for (int i = 0; i < world_size; i++) { + sendcounts[i] = elems_per_rank[i]; // number of elements to send to each rank + displs[i] = displacement; // displacement for each rank in the flattened array + // Copy elements for rank i to the flattened array + for (int j = 0; j < elems_per_rank[i]; j++) { + all_elements.push_back(elements_to_send[i][j]); // add the elements to the flattened array + } + displacement += elems_per_rank[i]; // increment the displacement by the number of elements to send to the next rank + } + + // Send the elements to each rank + // all_elements.data(): Pointer to the flattened array of all elements to be sent to each rank + // sendcounts.data(): Array with the number of elements to send to each rank + // displs.data(): Array with the displacement for each rank in the flattened array + // MPI_INT: Data type of the elements (integer) + // elements_on_rank.data(): Pointer to the buffer where each rank will receive its elements + // num_elements_on_rank: Number of elements that the receiving rank expects to receive + // MPI_INT: Data type of the receive buffer (integer) + // 0: The root rank (rank 0) that is performing the scatter + // MPI_COMM_WORLD: The communicator + MPI_Scatterv(all_elements.data(), sendcounts.data(), displs.data(), MPI_INT, + elements_on_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + // If the rank is not the root rank, it will receive nullptr for the sendbuf, sendcounts, and displs arrays + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + elements_on_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + + // Wait for all ranks to complete the scatter operation + MPI_Barrier(MPI_COMM_WORLD); + + // ****************************************************************************************** + // Scatter the number of nodes to each rank and compute which nodes to send to each rank + // 
****************************************************************************************** + std::vector nodes_per_rank(world_size); // number of nodes to send to each rank size(world_size) + std::vector nodes_on_rank; // node gids the current rank + std::vector> nodes_to_send(world_size); // nodes to send to each rank + + if (rank == 0) { + + // Populate the nodes_to_send array by finding all nodes in the elements in elements_to_send and removing duplicates + for (int i = 0; i < world_size; i++) { + std::set nodes_set; + for (int j = 0; j < elems_per_rank[i]; j++) { + for (int k = 0; k < num_nodes_per_elem; k++) { + nodes_set.insert(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); + } + } + nodes_to_send[i] = std::vector(nodes_set.begin(), nodes_set.end()); + } + + for (int i = 0; i < world_size; i++) { + nodes_per_rank[i] = nodes_to_send[i].size(); + } + } + + // Send the number of nodes to each rank using MPI_scatter + MPI_Scatter(nodes_per_rank.data(), 1, MPI_INT, &num_nodes_on_rank, 1, MPI_INT, 0, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + // resize the nodes_on_rank vector to hold the received data + nodes_on_rank.resize(num_nodes_on_rank); + + MPI_Barrier(MPI_COMM_WORLD); + + // ****************************************************************************************** + // Scatter the actual node global ids to each rank + // ****************************************************************************************** + if (rank == 0) { + + // Prepare data for MPI_Scatterv (scatter with variable counts) + // Flatten the 2D nodes_to_send into a 1D array + std::vector all_nodes; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = nodes_to_send[i].size(); + displs[i] = displacement; + // Copy nodes for rank i to the flattened array + for (int j = 0; j < nodes_to_send[i].size(); j++) { + all_nodes.push_back(nodes_to_send[i][j]); + } + 
displacement += nodes_to_send[i].size(); + } + // Send the nodes to each rank + // all_nodes.data(): Pointer to the flattened array of all nodes to be sent to each rank + // sendcounts.data(): Array with the number of nodes to send to each rank + // displs.data(): Array with the displacement for each rank in the flattened array + // MPI_INT: Data type of the nodes (integer) + // nodes_on_rank.data(): Pointer to the buffer where each rank will receive its nodes + // num_nodes_on_rank: Number of nodes that the receiving rank expects to receive + // MPI_INT: Data type of the receive buffer (integer) + // 0: The root rank (rank 0) that is performing the scatter + // MPI_COMM_WORLD: The communicator + MPI_Scatterv(all_nodes.data(), sendcounts.data(), displs.data(), MPI_INT, + nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + // ****************************************************************************************** + // Scatter the node positions to each rank + // ****************************************************************************************** + // Create a flat 1D vector for node positions (num_dim coordinates per node) + std::vector node_pos_on_rank_flat(num_nodes_on_rank * num_dim); + CArrayDual node_pos_on_rank(num_nodes_on_rank, num_dim, "node_pos_on_rank_decomp"); + + if(rank == 0){ + + // Prepare data for MPI_Scatterv (scatter with variable counts) + // Flatten the 2D node_pos_to_send into a 1D array + std::vector all_node_pos; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = nodes_to_send[i].size() * num_dim; + displs[i] = displacement; // displacement is the starting index of the nodes for the current rank in the flattened array + // Copy node 
positions for rank i to the flattened array + for(int node_gid = 0; node_gid < nodes_to_send[i].size(); node_gid++) { + for(int dim = 0; dim < num_dim; dim++) { + all_node_pos.push_back(initial_node.coords.host(nodes_to_send[i][node_gid], dim)); + } + } + displacement += nodes_to_send[i].size() * num_dim; + } + + // Send the node positions to each rank + MPI_Scatterv(all_node_pos.data(), sendcounts.data(), displs.data(), MPI_DOUBLE, + node_pos_on_rank.host_pointer(), num_nodes_on_rank * num_dim, MPI_DOUBLE, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_DOUBLE, + node_pos_on_rank.host_pointer(), num_nodes_on_rank * num_dim, MPI_DOUBLE, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + node_pos_on_rank.update_device(); + + // ****************************************************************************************** + // Initialize the node state variables + // ****************************************************************************************** + + // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + naive_node.initialize(num_nodes_on_rank, num_dim, required_node_state); + + FOR_ALL(node_id, 0, num_nodes_on_rank, + dim, 0, num_dim,{ + naive_node.coords(node_id, dim) = node_pos_on_rank(node_id, dim); + }); + MATAR_FENCE(); + + naive_node.coords.update_host(); + + // ****************************************************************************************** + // Send the element-node connectivity data from the initial mesh to each rank + // ****************************************************************************************** + + // Send the element-node connectivity data from the initial mesh to each rank + std::vector nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); + + MPI_Barrier(MPI_COMM_WORLD); + + + // Instead of staging a full copy of the connectivity data 
per-rank, compute the + // scatter counts/displacements directly from the contiguous global array. + std::vector conn_sendcounts(world_size); + std::vector conn_displs(world_size); + int conn_displacement = 0; + for (int i = 0; i < world_size; i++) { + conn_sendcounts[i] = elems_per_rank[i] * num_nodes_per_elem; + conn_displs[i] = conn_displacement; + conn_displacement += conn_sendcounts[i]; + } + + // Scatter using the native storage type (size_t) and then convert locally to int + size_t* global_nodes_in_elem = nullptr; + if (rank == 0) { + global_nodes_in_elem = initial_mesh.nodes_in_elem.host_pointer(); + } + MPI_Barrier(MPI_COMM_WORLD); + + + { //scope to free memory for tmp vector + std::vector nodes_in_elem_on_rank_size_t(num_elements_on_rank * num_nodes_per_elem); + + MPI_Scatterv(global_nodes_in_elem, conn_sendcounts.data(), conn_displs.data(), MPI_UNSIGNED_LONG_LONG, + nodes_in_elem_on_rank_size_t.data(), nodes_in_elem_on_rank_size_t.size(), MPI_UNSIGNED_LONG_LONG, + 0, MPI_COMM_WORLD); + + for (size_t idx = 0; idx < nodes_in_elem_on_rank_size_t.size(); ++idx) { + nodes_in_elem_on_rank[idx] = static_cast(nodes_in_elem_on_rank_size_t[idx]); + } + } + + // ****************************************************************************************** + // Send the element-element connectivity data from the initial mesh to each rank + // ****************************************************************************************** + + // First, rank 0 computes how many connectivity entries each rank will receive + // and scatters that information + int total_elem_elem_entries = 0; + + std::vector elem_elem_counts(world_size); + + if (rank == 0){ + + DCArrayKokkos tmp_num_elems_in_elem(initial_mesh.num_elems, "tmp_elems_in_elem"); + FOR_ALL(i, 0, initial_mesh.num_elems, { + tmp_num_elems_in_elem(i) = initial_mesh.num_elems_in_elem(i); + }); + tmp_num_elems_in_elem.update_host(); + MATAR_FENCE(); + // Calculate total number of connectivity entries for each rank + 
for(int i = 0; i < world_size; i++) { + elem_elem_counts[i] = 0; + for(int k = 0; k < elements_to_send[i].size(); k++) { + elem_elem_counts[i] += tmp_num_elems_in_elem.host(elements_to_send[i][k]); + } + } + } + + // Define total_elem_elem_entries to be the sum of the elem_elem_counts + // Scatter the counts to each rank + MPI_Scatter(elem_elem_counts.data(), 1, MPI_INT, + &total_elem_elem_entries, 1, MPI_INT, + 0, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout<< " Finished scatter" <(total_elem_elem_entries, "elems_in_elem_on_rank"); + + // Now scatter the num_elems_in_elem for each element on each rank + num_elems_in_elem_per_rank = CArrayDual(num_elements_on_rank, "num_elems_in_elem_per_rank"); + + if (rank == 0) { + std::vector all_num_elems_in_elem; + std::vector displs_ee(world_size); + int displacement = 0; + + DCArrayKokkos tmp_num_elems_in_elem(initial_mesh.num_elems, "tmp_elems_in_elem"); + FOR_ALL(i, 0, initial_mesh.num_elems, { + tmp_num_elems_in_elem(i) = initial_mesh.num_elems_in_elem(i); + }); + tmp_num_elems_in_elem.update_host(); + MATAR_FENCE(); + + for(int i = 0; i < world_size; i++) { + displs_ee[i] = displacement; + + std::cout<< "Rank = "<< i < all_elems_in_elem; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + + DRaggedRightArrayKokkos tmp_elems_in_elem(initial_mesh.num_elems_in_elem, "temp_elem_in_elem"); + + FOR_ALL(elem_gid, 0, initial_mesh.num_elems, { + for (size_t i = 0; i < initial_mesh.num_elems_in_elem(elem_gid); i++) { + tmp_elems_in_elem(elem_gid, i) = initial_mesh.elems_in_elem(elem_gid, i); + } // end for i + }); // end FOR_ALL elems + MATAR_FENCE(); + tmp_elems_in_elem.update_host(); + + + + DCArrayKokkos tmp_num_elems_in_elem(initial_mesh.num_elems, "tmp_elems_in_elem"); + FOR_ALL(i, 0, initial_mesh.num_elems, { + tmp_num_elems_in_elem(i) = initial_mesh.num_elems_in_elem(i); + }); + MATAR_FENCE(); + tmp_num_elems_in_elem.update_host(); + + + 
for(int i = 0; i < world_size; i++) { + sendcounts[i] = elem_elem_counts[i]; + displs[i] = displacement; + + // Copy element-element connectivity for rank i + for(int k = 0; k < elements_to_send[i].size(); k++) { + for(int l = 0; l < tmp_num_elems_in_elem.host(elements_to_send[i][k]); l++) { + all_elems_in_elem.push_back(tmp_elems_in_elem.host(elements_to_send[i][k], l)); + } + } + displacement += elem_elem_counts[i]; + } + + // Send the element-element connectivity data to each rank using MPI_Scatterv + MPI_Scatterv(all_elems_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, + elems_in_elem_on_rank.host_pointer(), total_elem_elem_entries, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + elems_in_elem_on_rank.host_pointer(), total_elem_elem_entries, MPI_INT, + 0, MPI_COMM_WORLD); + } + + elems_in_elem_on_rank.update_device(); + + MPI_Barrier(MPI_COMM_WORLD); + + // ****************************************************************************************** + // Initialize the naive_mesh data structures for each rank + // ****************************************************************************************** + naive_mesh.initialize_nodes(num_nodes_on_rank); + naive_mesh.initialize_elems(num_elements_on_rank, num_dim); + + naive_mesh.local_to_global_node_mapping = DCArrayKokkos(num_nodes_on_rank, "naive_mesh.local_to_global_node_mapping"); + naive_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_elements_on_rank, "naive_mesh.local_to_global_elem_mapping"); + + for(int i = 0; i < num_nodes_on_rank; i++) { + naive_mesh.local_to_global_node_mapping.host(i) = nodes_on_rank[i]; + } + + for(int i = 0; i < num_elements_on_rank; i++) { + naive_mesh.local_to_global_elem_mapping.host(i) = elements_on_rank[i]; + } + + naive_mesh.local_to_global_node_mapping.update_device(); + naive_mesh.local_to_global_elem_mapping.update_device(); + + MPI_Barrier(MPI_COMM_WORLD); + + // Timer for reverse mapping of element-node 
connectivity + double t_reverse_map_start = MPI_Wtime(); + + // rebuild the local element-node connectivity using the local node ids + for(int i = 0; i < num_elements_on_rank; i++) { + for(int j = 0; j < num_nodes_per_elem; j++) { + int node_gid = nodes_in_elem_on_rank[i * num_nodes_per_elem + j]; + + int node_lid = -1; + + // Use binary search to find the local node index for node_gid, local_to_global_node_mapping is sorted + int left = 0, right = num_nodes_on_rank - 1; + while (left <= right) { + int mid = left + (right - left) / 2; + size_t mid_gid = naive_mesh.local_to_global_node_mapping.host(mid); + if (node_gid == mid_gid) { + node_lid = mid; + break; + } else if (node_gid < mid_gid) { + right = mid - 1; + } else { + left = mid + 1; + } + } + + naive_mesh.nodes_in_elem.host(i, j) = node_lid; + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + double t_reverse_map_end = MPI_Wtime(); + if(rank == 0 && print_info) { + std::cout<<" Finished reverse mapping of the element-node connectivity from the global node ids to the local node ids"< All ranks receive: [elem_count[0], elem_count[1], elem_count[2], ...] + // Rank 1: elem_count[1] / + // Rank 2: elem_count[2] / + + int num_dim = input_mesh.num_dims; + + int nodes_per_elem = input_mesh.num_nodes_in_elem; + + // MPI_Allgather: Each rank sends its element count, every rank receives + // the count from every other rank. Result: elem_counts[r] = number of + // elements owned by rank r. 
+ std::vector elem_counts(world_size); + MPI_Allgather(&input_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); // Synchronize all ranks before proceeding + + // Compute displacements: offset into the global array for each rank's data + // Example: if elem_counts = [100, 150, 120], then + // elem_displs = [0, 100, 250] (where each rank's data starts in all_elem_gids) + std::vector elem_displs(world_size); + int total_elems = 0; + for (int r = 0; r < world_size; r++) { + elem_displs[r] = total_elems; + total_elems += elem_counts[r]; + } + + // MPI_Allgatherv: Gather variable-sized data from all ranks into one array + // Each rank contributes its local_to_global_elem_mapping, which maps + // local element indices to global element GIDs. After this call, + // all_elem_gids contains ALL element GIDs from all ranks, organized by rank. + std::vector all_elem_gids(total_elems); + MPI_Allgatherv(input_mesh.local_to_global_elem_mapping.host_pointer(), input_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, + all_elem_gids.data(), elem_counts.data(), elem_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + // Build a lookup map: element GID -> owning rank + // This allows O(log n) lookups to determine which rank owns any given element. 
+ std::map elem_gid_to_rank; + for (int rank_id = 0; rank_id < world_size; rank_id++) { + for (int i = 0; i < elem_counts[rank_id]; i++) { + size_t gid = all_elem_gids[elem_displs[rank_id] + i]; + elem_gid_to_rank[gid] = rank_id; + } + } + + // ======================================================================== + // STEP 2: Build index sets for local elements and nodes + // ======================================================================== + std::set local_node_gids; + std::map global_to_local_node_mapping; // GID -> local index mapping + for(int node_rid = 0; node_rid < input_mesh.num_nodes; node_rid++) { + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_rid); + local_node_gids.insert(node_gid); + global_to_local_node_mapping[node_gid] = node_rid; + } + + // Build a set of locally-owned element GIDs for quick lookup + std::set local_elem_gids; + for (int i = 0; i < input_mesh.num_elems; i++) { + local_elem_gids.insert(input_mesh.local_to_global_elem_mapping.host(i)); + } + + // ======================================================================== + // STEP 3: Exchange element-to-node connectivity via MPI_Allgatherv + // ======================================================================== + // Build a flattened connectivity array: pairs of (elem_gid, node_gid) + // Example for 2 elements with 8 nodes each: + // elem_node_conn = [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] + // + // This format is chosen because it's easy to serialize and deserialize over MPI, + // and allows us to reconstruct the full element-node relationships. 
+ std::vector elem_node_conn; + int local_conn_size = 0; + + // For each locally-owned element, record its GID and all its node GIDs + for (int lid = 0; lid < input_mesh.num_elems; lid++) { + size_t elem_gid = input_mesh.local_to_global_elem_mapping.host(lid); + + // Access nodes_in_elem[lid][*] to get all nodes in this element + for (int j = 0; j < input_mesh.num_nodes_in_elem; j++) { + size_t node_lid = input_mesh.nodes_in_elem.host(lid, j); // Local index + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_lid); // Global index + + elem_node_conn.push_back(elem_gid); + elem_node_conn.push_back(node_gid); + } + local_conn_size += nodes_per_elem * 2; // Each element contributes (num_nodes_in_elem * 2) size_ts + } + + + + // ======================================================================== + // Perform MPI communication to gather connectivity from all ranks + // ======================================================================== + // Similar to Step 1, we use MPI_Allgatherv to collect all element-node + // connectivity pairs. This is a two-stage process: + // 1) Gather the size of each rank's connectivity data + // 2) Gather the actual connectivity data with proper offsets + + // Stage 1: Gather connectivity sizes from each rank + // conn_sizes[r] = number of size_t values that rank r will send + std::vector conn_sizes(world_size); + MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + // Compute displacements for the second MPI_Allgatherv call + // Displcements tell each rank where its data should be placed in the global array + std::vector conn_displs(world_size); + int total_conn = 0; + for (int r = 0; r < world_size; r++) { + conn_displs[r] = total_conn; + total_conn += conn_sizes[r]; + } + + // Stage 2: Gather all element-node connectivity data + // After this call, all_conn contains the flattened connectivity from every rank, + // organized by rank. 
Access data from rank r using indices [conn_displs[r], conn_displs[r] + conn_sizes[r]) + std::vector all_conn(total_conn); + MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG, + all_conn.data(), conn_sizes.data(), conn_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + // ======================================================================== + // STEP 4: Identify ghost elements + // ======================================================================== + // A ghost element is an element owned by another rank that shares at least + // one node with our locally-owned elements. This step identifies all such elements. + + + // We use a set to eliminate duplicates (same ghost element might share multiple nodes with us) + std::set ghost_elem_gids; + std::set ghost_node_gids; + + std::map ghost_node_recv_rank; + + // Iterate through connectivity data from each rank (except ourselves) + for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // Skip our own data - we already know our elements + + // Parse the connectivity data for rank r + // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
+ // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Offset into all_conn for this pair (elem_gid, node_gid) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // Check if this node belongs to one of our locally-owned elements + if (local_node_gids.find(node_gid) != local_node_gids.end()) { + + // Check if this element is NOT owned by us (i.e., it's from another rank) + if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { + // This is a ghost element for us + ghost_elem_gids.insert(elem_gid); + } + } + } + } + MPI_Barrier(MPI_COMM_WORLD); + + std::map> ghost_nodes_from_ranks; + + // Iterate through connectivity data from each rank (except ourselves) + for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // Skip our own data - we already know our elements + + // Parse the connectivity data for rank r + // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
+ // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Offset into all_conn for this pair (elem_gid, node_gid) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // Check if this element belongs to one of our ghost elements + if (ghost_elem_gids.find(elem_gid) != ghost_elem_gids.end()) { + + // Check if this node is NOT owned by us (i.e., it's from another rank) + if (local_node_gids.find(node_gid) == local_node_gids.end()) { + // This is a ghost node for us + ghost_node_gids.insert(node_gid); + ghost_node_recv_rank[node_gid] = r; + ghost_nodes_from_ranks[r].insert(node_gid); + } + } + } + } + + std::set shared_nodes; // nodes on MPI rank boundaries + // Iterate through connectivity data from each rank (except ourselves) to find shared nodes + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (r == rank) continue; // Skip our own data - we already know our elements + + // Parse the connectivity data for rank r + // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
+ // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Offset into all_conn for this pair (elem_gid, node_gid) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // Check if this element belongs to one of our ghost elements + if (ghost_elem_gids.find(elem_gid) != ghost_elem_gids.end()) { + // If another rank references a node that is also owned by us, it is a shared node + if (local_node_gids.find(node_gid) != local_node_gids.end()) { + shared_nodes.insert(node_gid); + + } + } + } + } + + // Create a vecor of the ranks that this rank will receive data from for ghost nodes + std::set ghost_node_receive_ranks; + for (const auto& pair : ghost_node_recv_rank) { + ghost_node_receive_ranks.insert(pair.second); + } + + std::vector ghost_node_receive_ranks_vec(ghost_node_receive_ranks.begin(), ghost_node_receive_ranks.end()); + + + // Find which nodes *we own* are ghosted on other ranks, and on which ranks + // We want: for each of our local nodes, the list of ranks that ghost it + + // Map: local_node_gid -> set of remote ranks that ghost this node + std::map> local_node_gid_to_ghosting_ranks; + + std::vector> shared_nodes_on_ranks(world_size); + + // Iterate through connectivity from all ranks except ourselves + for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // skip our own rank + + int num_pairs = conn_sizes[r] / 2; + for (int i = 0; i < num_pairs; i++) { + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this node is owned by us, and remote rank references it, they are ghosting it + if (local_node_gids.find(node_gid) != local_node_gids.end()) { + local_node_gid_to_ghosting_ranks[node_gid].insert(r); + shared_nodes_on_ranks[r].insert(node_gid); + } + } + } + + // Use the map to create a vector of the ranks that 
this rank will receive data from for ghost nodes + std::set ghost_node_send_ranks; + for (const auto& pair : local_node_gid_to_ghosting_ranks) { + ghost_node_send_ranks.insert(pair.second.begin(), pair.second.end()); + } + std::vector ghost_node_send_ranks_vec(ghost_node_send_ranks.begin(), ghost_node_send_ranks.end()); + + // Store the count of ghost elements for later use + input_mesh.num_ghost_elems = ghost_elem_gids.size(); + input_mesh.num_ghost_nodes = ghost_node_gids.size(); + MPI_Barrier(MPI_COMM_WORLD); + + + // ======================================================================== + // STEP 5: Extract ghost element connectivity + // ======================================================================== + // Now that we know which elements are ghosts, we need to extract their + // full node connectivity from all_conn. This allows us to properly construct + // the extended mesh with ghost elements included. + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build extended mesh with ghost elements" << std::endl; + + // Build a map: ghost_elem_gid -> vector of node_gids + // We pre-allocate the vector size to avoid repeated reallocations + std::map> ghost_elem_to_nodes; + for (const size_t& ghost_gid : ghost_elem_gids) { + ghost_elem_to_nodes[ghost_gid].reserve(input_mesh.num_nodes_in_elem); + } + + // ======================================================================== + // Extract nodes for each ghost element from the globally-collected all_conn + // ======================================================================== + // The all_conn array was populated by MPI_Allgatherv and contains connectivity + // pairs (elem_gid, node_gid) for all elements from all ranks. We now parse + // this data to extract the nodes for each ghost element. 
+ for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // Skip our own data - we already have owned element connectivity + + // Parse connectivity data for rank r + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Calculate offset for this pair: displacement + (pair_index * 2) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this element is one of our identified ghost elements, record its node + auto it = ghost_elem_to_nodes.find(elem_gid); + if (it != ghost_elem_to_nodes.end()) { + it->second.push_back(node_gid); + } + } + } + + // ======================================================================== + // Validation: Verify each ghost element has the correct number of nodes + // ======================================================================== + // This catch detects issues in the MPI communication or parsing logic + for (auto& pair : ghost_elem_to_nodes) { + if (pair.second.size() != static_cast(input_mesh.num_nodes_in_elem)) { + std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first + << " has " << pair.second.size() << " nodes, expected " << input_mesh.num_nodes_in_elem << std::endl; + } + } + + // Step 2: Build extended node list (owned nodes first, then ghost-only nodes) + // Start with owned nodes + std::map node_gid_to_extended_lid; + int extended_node_lid = 0; + + // Add all owned nodes + for (int i = 0; i < input_mesh.num_nodes; i++) { + size_t node_gid = input_mesh.local_to_global_node_mapping.host(i); + node_gid_to_extended_lid[node_gid] = extended_node_lid++; + } + + // Add ghost-only nodes (nodes that belong to ghost elements but not to owned elements) + std::set ghost_only_nodes; + for (const auto& pair : ghost_elem_to_nodes) { + for (size_t node_gid : pair.second) { + // Check if we already have this node + if (node_gid_to_extended_lid.find(node_gid) == node_gid_to_extended_lid.end()) { + 
ghost_only_nodes.insert(node_gid); + } + } + } + + // Assign extended local IDs to ghost-only nodes + for (size_t node_gid : ghost_only_nodes) { + node_gid_to_extended_lid[node_gid] = extended_node_lid++; + } + + int total_extended_nodes = extended_node_lid; + + MPI_Barrier(MPI_COMM_WORLD); + + // Step 3: Prepare requests for ghost node coordinates from owning ranks (if needed later) + // Build request list: for each ghost node, find an owning rank via any ghost element that contains it + std::map> rank_to_ghost_node_requests; + for (size_t node_gid : ghost_only_nodes) { + // Find which rank owns an element containing this node + // Look through ghost elements + for (const auto& pair : ghost_elem_to_nodes) { + size_t ghost_elem_gid = pair.first; + const std::vector& nodes = pair.second; + bool found = false; + for (size_t ngid : nodes) { + if (ngid == node_gid) { + found = true; + break; + } + } + if (found) { + auto owner_it = elem_gid_to_rank.find(ghost_elem_gid); + if (owner_it != elem_gid_to_rank.end()) { + rank_to_ghost_node_requests[owner_it->second].push_back(node_gid); + break; + } + } + } + } + + // Step 4: Build extended element list and node connectivity + // Owned elements: 0 to num_new_elems-1 (already have these) + // Ghost elements: num_new_elems to num_new_elems + num_ghost_elems - 1 + + // Create extended element-node connectivity array + int total_extended_elems = input_mesh.num_elems + input_mesh.num_ghost_elems; + std::vector> extended_nodes_in_elem(total_extended_elems); + + // Copy owned element connectivity (convert to extended node LIDs) + for (int lid = 0; lid < input_mesh.num_elems; lid++) { + extended_nodes_in_elem[lid].reserve(nodes_per_elem); + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = input_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_lid); + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[lid].push_back(ext_lid); + } + } + + // 
Add ghost element connectivity (map ghost node GIDs to extended node LIDs) + int ghost_elem_ext_lid = input_mesh.num_elems; + std::vector ghost_elem_gids_ordered(ghost_elem_gids.begin(), ghost_elem_gids.end()); + std::sort(ghost_elem_gids_ordered.begin(), ghost_elem_gids_ordered.end()); + + for (size_t ghost_gid : ghost_elem_gids_ordered) { + auto it = ghost_elem_to_nodes.find(ghost_gid); + if (it == ghost_elem_to_nodes.end()) continue; + + extended_nodes_in_elem[ghost_elem_ext_lid].reserve(nodes_per_elem); + for (size_t node_gid : it->second) { + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[ghost_elem_ext_lid].push_back(ext_lid); + } + ghost_elem_ext_lid++; + } + + MPI_Barrier(MPI_COMM_WORLD); + // Sequential rank-wise printing of extended mesh structure info + if(print_info) { + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; + std::cout << "[rank " << rank << "] - Owned elements: " << input_mesh.num_elems << std::endl; + std::cout << "[rank " << rank << "] - Ghost elements: " << ghost_elem_gids.size() << std::endl; + std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << std::endl; + std::cout << "[rank " << rank << "] - Owned nodes: " << input_mesh.num_nodes << std::endl; + std::cout << "[rank " << rank << "] - Ghost-only nodes: " << ghost_only_nodes.size() << std::endl; + std::cout << "[rank " << rank << "] - Total extended nodes: " << total_extended_nodes << std::endl; + std::cout << std::flush; + } + MPI_Barrier(MPI_COMM_WORLD); + } + } + // The extended_nodes_in_elem vector now contains the connectivity for both owned and ghost elements + // Each element's nodes are stored using extended local node IDs (0-based, contiguous) + + // Build reverse maps: extended_lid -> gid for nodes and elements + std::vector extended_lid_to_node_gid(total_extended_nodes); + 
for (const auto& pair : node_gid_to_extended_lid) { + extended_lid_to_node_gid[pair.second] = pair.first; + } + + // Build extended element GID list: owned first, then ghost + std::vector extended_lid_to_elem_gid(total_extended_elems); + + // Owned elements + for (int i = 0; i < input_mesh.num_elems; i++) { + extended_lid_to_elem_gid[i] = input_mesh.local_to_global_elem_mapping.host(i); + } + + // Ghost elements (in sorted order) + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { + extended_lid_to_elem_gid[input_mesh.num_elems + i] = ghost_elem_gids_ordered[i]; + } + + // Build array: for each ghost element, store which rank owns it (where to receive data from) + std::vector ghost_elem_owner_ranks(ghost_elem_gids_ordered.size()); + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { + size_t ghost_gid = ghost_elem_gids_ordered[i]; + auto it = elem_gid_to_rank.find(ghost_gid); + if (it != elem_gid_to_rank.end()) { + ghost_elem_owner_ranks[i] = it->second; + } else { + std::cerr << "[rank " << rank << "] ERROR: Ghost element GID " << ghost_gid + << " not found in elem_gid_to_rank map!" 
<< std::endl; + ghost_elem_owner_ranks[i] = -1; // Invalid rank as error indicator + } + } + + // Create a std::set of all the ranks this rank will receive data from + std::set ghost_elem_receive_ranks; + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { + ghost_elem_receive_ranks.insert(ghost_elem_owner_ranks[i]); + } + + // ****************************************************************************************** + // Build the final partitioned mesh + // ****************************************************************************************** + + + output_mesh.initialize_nodes(total_extended_nodes); + output_mesh.initialize_elems(total_extended_elems, 3); + output_mesh.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); + output_mesh.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); + for (int i = 0; i < total_extended_nodes; i++) { + output_mesh.local_to_global_node_mapping.host(i) = extended_lid_to_node_gid[i]; + } + for (int i = 0; i < total_extended_elems; i++) { + output_mesh.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; + } + output_mesh.local_to_global_node_mapping.update_device(); + output_mesh.local_to_global_elem_mapping.update_device(); + + output_mesh.num_ghost_elems = ghost_elem_gids.size(); + output_mesh.num_ghost_nodes = ghost_only_nodes.size(); + + output_mesh.num_owned_elems = input_mesh.num_elems; + output_mesh.num_owned_nodes = input_mesh.num_nodes; + + MPI_Barrier(MPI_COMM_WORLD); + // rebuild the local element-node connectivity using the local node ids + // extended_nodes_in_elem already contains extended local node IDs, so we can use them directly + for(int i = 0; i < total_extended_elems; i++) { + for(int j = 0; j < nodes_per_elem; j++) { + output_mesh.nodes_in_elem.host(i, j) = extended_nodes_in_elem[i][j]; + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + output_mesh.nodes_in_elem.update_device(); + output_mesh.build_connectivity(); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank 
== 0) std::cout << " Finished building final mesh structure" << std::endl; + + + // ****************************************************************************************** + // Build the final nodes that include ghost + // ****************************************************************************************** + + + output_node.initialize(total_extended_nodes, num_dim, {node_state::coords}, node_communication_plan); + MPI_Barrier(MPI_COMM_WORLD); + + // The goal here is to populate output_node.coords using globally gathered ghost node coordinates, + // since input_node does not contain ghost node coordinates. + // + // Each rank will: + // 1. Gather coordinates of its owned nodes (from input_node). + // 2. Use MPI to gather all coordinates for all required (owned + ghost) global node IDs + // into a structure mapping global ID -> coordinate. + // 3. Use this map to fill output_node.coords. + + // 1. Build list of all global node IDs needed on this rank (owned + ghosts) + std::vector all_needed_node_gids(total_extended_nodes); + for (int i = 0; i < total_extended_nodes; i++) { + all_needed_node_gids[i] = output_mesh.local_to_global_node_mapping.host(i); + } + + // 2. Build owned node GIDs and their coordinates + std::vector owned_gids(output_mesh.num_owned_nodes); + for (int i = 0; i < output_mesh.num_owned_nodes; i++) + owned_gids[i] = output_mesh.local_to_global_node_mapping.host(i); + + // 3. Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) + // so we can distribute the needed coordinate data. 
+ // The easiest is to Allgather everyone's "owned_gids" and coords + + int local_owned_count = static_cast(owned_gids.size()); + std::vector owned_counts(world_size, 0); + if (local_owned_count < 0) local_owned_count = 0; // Clean up possibility of -1 + + // a) Gather counts + owned_counts.resize(world_size, 0); + MPI_Allgather(&local_owned_count, 1, MPI_INT, owned_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // b) Displacements and total + std::vector owned_displs(world_size,0); + int total_owned = 0; + for (int r = 0; r < world_size; r++) { + owned_displs[r] = total_owned; + total_owned += owned_counts[r]; + } + + // c) Global GIDs (size: total_owned) + std::vector all_owned_gids(total_owned); + MPI_Allgatherv(owned_gids.data(), local_owned_count, MPI_UNSIGNED_LONG_LONG, + all_owned_gids.data(), owned_counts.data(), owned_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + // Map node gid -> owning rank + std::unordered_map node_gid_to_owner_rank; + int owner_offset = 0; + for (int r = 0; r < world_size; r++) { + for (int i = 0; i < owned_counts[r]; i++) { + node_gid_to_owner_rank[all_owned_gids[owner_offset + i]] = r; + } + owner_offset += owned_counts[r]; + } + + + // d) Global coords (size: total_owned x 3) + std::vector owned_coords_send(num_dim*local_owned_count, 0.0); + for (int i = 0; i < local_owned_count; i++) { + for(int dim = 0; dim < num_dim; dim++){ + owned_coords_send[num_dim*i+dim] = input_node.coords.host(i,dim); + } + } + std::vector all_owned_coords(num_dim * total_owned, 0.0); + + // Create coordinate-specific counts and displacements (in units of doubles, not nodes) + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Getting coord_counts" << std::endl; + + std::vector coord_counts(world_size); + std::vector coord_displs(world_size); + for (int r = 0; r < world_size; r++) { + coord_counts[r] = num_dim * owned_counts[r]; // Each node has num_dim doubles + coord_displs[r] = num_dim * owned_displs[r]; // Displacement in 
doubles + } + + MPI_Allgatherv(owned_coords_send.data(), num_dim*local_owned_count, MPI_DOUBLE, + all_owned_coords.data(), coord_counts.data(), coord_displs.data(), + MPI_DOUBLE, MPI_COMM_WORLD); + + // e) Build map: gid -> coord[3] + std::unordered_map> gid_to_coord; + for (int i = 0; i < total_owned; i++) { + std::vector xyz(num_dim); // size is runtime-dependent + for (int dim = 0; dim < num_dim; dim++) { + xyz[dim] = all_owned_coords[num_dim * i + dim]; + } + gid_to_coord[all_owned_gids[i]] = std::move(xyz); + } + + // 4. Finally, fill output_node.coords with correct coordinates. + for (int i = 0; i < total_extended_nodes; i++) { + size_t gid = output_mesh.local_to_global_node_mapping.host(i); + auto it = gid_to_coord.find(gid); + if (it != gid_to_coord.end()) { + for (int dim = 0; dim < num_dim; dim++) { + output_node.coords.host(i,dim) = it->second[dim]; + } + } else { + // Could happen if there's a bug: fill with zeros for safety + for (int dim = 0; dim < num_dim; dim++) { + output_node.coords.host(i,dim) = 0.0; + } + } + } + output_node.coords.update_device(); + + + // -------------------------------------------------------------------------------------- + // Build the send patterns for elements + // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost element GIDs. + // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. + // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
+ // -------------------------------------------------------------------------------------- + std::vector>> boundary_elem_targets(output_mesh.num_owned_elems); + + // Prepare local ghost list as vector + std::vector ghost_gids_vec; + ghost_gids_vec.reserve(output_mesh.num_ghost_elems); + for (int i = 0; i < output_mesh.num_ghost_elems; i++) { + ghost_gids_vec.push_back(output_mesh.local_to_global_elem_mapping.host(output_mesh.num_owned_elems + i)); // Ghost elements are after the owned elements in the global element mapping + } + + // Exchange counts + std::vector ghost_counts(world_size, 0); + int local_ghost_count = output_mesh.num_ghost_elems; + MPI_Allgather(&local_ghost_count, 1, MPI_INT, ghost_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // Displacements and recv buffer + std::vector ghost_displs(world_size, 0); + int total_ghosts = 0; + for (int r = 0; r < world_size; r++) { + ghost_displs[r] = total_ghosts; + total_ghosts += ghost_counts[r]; + } + std::vector all_ghost_gids(total_ghosts); + + // Gather ghost gids + MPI_Allgatherv(ghost_gids_vec.data(), local_ghost_count, MPI_UNSIGNED_LONG_LONG, + all_ghost_gids.data(), ghost_counts.data(), ghost_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + + // Build map gid -> ranks that ghost it + std::unordered_map> gid_to_ghosting_ranks; + gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); + for (int r = 0; r < world_size; r++) { + int cnt = ghost_counts[r]; + int off = ghost_displs[r]; + for (int i = 0; i < cnt; i++) { + size_t g = all_ghost_gids[off + i]; + gid_to_ghosting_ranks[g].push_back(r); + } + } + + // For each local element, list destinations: ranks that ghost our gid + for (int elem_lid = 0; elem_lid < output_mesh.num_owned_elems; elem_lid++) { + size_t local_elem_gid = output_mesh.local_to_global_elem_mapping.host(elem_lid); + auto it = gid_to_ghosting_ranks.find(local_elem_gid); + if (it == gid_to_ghosting_ranks.end()) continue; + const std::vector &dest_ranks = it->second; + for 
(int rr : dest_ranks) { + if (rr == rank) continue; + boundary_elem_targets[elem_lid].push_back(std::make_pair(rr, local_elem_gid)); + } + } + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<"After boundary_elem_targets"< boundary_elem_local_ids; + std::vector> boundary_to_ghost_ranks; // ragged array dimensions (num_boundary_elems, num_ghost_ranks) + + std::set ghost_comm_ranks; // set of ranks that this rank communicates with + + + for (int elem_lid = 0; elem_lid < output_mesh.num_owned_elems; elem_lid++) { + + int local_elem_gid = output_mesh.local_to_global_elem_mapping.host(elem_lid); + if (boundary_elem_targets[elem_lid].empty()) + { + continue; + } + else + { + // Fill in vector of boundary local_ids + boundary_elem_local_ids.push_back(elem_lid); + std::vector ghost_ranks_for_this_boundary_elem; + for (const auto &pr : boundary_elem_targets[elem_lid]) { + ghost_ranks_for_this_boundary_elem.push_back(pr.first); + ghost_comm_ranks.insert(pr.first); + } + boundary_to_ghost_ranks.push_back(ghost_ranks_for_this_boundary_elem); + } + } + + int num_ghost_comm_ranks = ghost_comm_ranks.size(); + std::vector ghost_comm_ranks_vec(num_ghost_comm_ranks); + int i = 0; + for (const auto &r : ghost_comm_ranks) { + ghost_comm_ranks_vec[i] = r; + i++; + } + + + MPI_Barrier(MPI_COMM_WORLD); + + output_mesh.num_boundary_elems = boundary_elem_local_ids.size(); + output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems, "boundary_elem_local_ids"); + for (int i = 0; i < output_mesh.num_boundary_elems; i++) { + output_mesh.boundary_elem_local_ids.host(i) = boundary_elem_local_ids[i]; + } + output_mesh.boundary_elem_local_ids.update_device(); + + print_info = false; + + + MPI_Barrier(MPI_COMM_WORLD); + + std::map> node_set_to_send_by_rank; + + // For each owned element that will be ghosted on other ranks, + // collect the nodes that need to be sent to those ranks + // boundary_elem_targets[elem_lid] contains pairs (rank, elem_gid) for ranks that 
ghost this element + for (int elem_lid = 0; elem_lid < input_mesh.num_elems; elem_lid++) { + // Get ranks that will ghost this element + for (const auto& pair : boundary_elem_targets[elem_lid]) { + int ghosting_rank = pair.first; + + // For each node in this element + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = input_mesh.nodes_in_elem.host(elem_lid, j); + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_lid); + + // Only send nodes that are NOT shared (not on MPI rank boundary) + // Shared nodes are already known to both ranks + if (shared_nodes_on_ranks[ghosting_rank].find(node_gid) == shared_nodes_on_ranks[ghosting_rank].end()) { // WARNING: THIS SHOULD BE MOFIFIED TO ONLY FILTER SHARED NODES WITH THIS SPECIFIC RANK + node_set_to_send_by_rank[ghosting_rank].insert(node_gid); + } + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + std::map> nodes_to_send_by_rank; // rank -> list of global node indices + + // Copy the node_set_to_send_by_rank map to nodes_to_send_by_rank + for (const auto& [dest_rank, node_gids] : node_set_to_send_by_rank) { + for (size_t node_gid : node_gids) { + nodes_to_send_by_rank[dest_rank].push_back(node_gid); + } + } + + // Initialize graph comms for elements + // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator + // that efficiently represents the communication pattern between ranks. + // This allows MPI to optimize communication based on the actual connectivity pattern. 
+ + + // ---------- Prepare INCOMING edges (sources) ---------- + // indegree: Number of ranks from which this rank will RECEIVE data + // These are the ranks that own elements which are ghosted on this rank + std::vector ghost_elem_receive_ranks_vec(ghost_elem_receive_ranks.begin(), + ghost_elem_receive_ranks.end()); + // The number of ranks from which this rank will receive data (incoming neighbors) + int elem_indegree = static_cast(ghost_elem_receive_ranks_vec.size()); + + // sources: Array of source rank IDs (ranks we receive from) + // Each element corresponds to a rank that owns elements we ghost + int* sources = (elem_indegree > 0) ? ghost_elem_receive_ranks_vec.data() : MPI_UNWEIGHTED; + + + // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + // int* sourceweights = MPI_UNWEIGHTED; + + // ---------- Prepare OUTGOING edges (destinations) ---------- + // outdegree: Number of ranks to which this rank will SEND data + // These are the ranks that ghost elements owned by this rank + int outdegree = num_ghost_comm_ranks; + + // destinations: Array of destination rank IDs (ranks we send to) + // Each element corresponds to a rank that ghosts our owned elements + int* destinations = (outdegree > 0) ? 
ghost_comm_ranks_vec.data() : MPI_UNWEIGHTED; + + // Initialize the graph communicator for element communication + element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), elem_indegree, ghost_elem_receive_ranks_vec.data()); + MPI_Barrier(MPI_COMM_WORLD); + + // Optional: Verify the graph communicator was created successfully + // if(print_info) element_communication_plan.verify_graph_communicator(); + + + // Initialize graph comms for nodes + // ---------- Prepare INCOMING edges (sources) ---------- + // indegree: Number of ranks from which this rank will RECEIVE data + // These are the ranks that own nodes which are ghosted on this rank + int node_indegree = static_cast(ghost_node_receive_ranks.size()); + int* node_sources = (node_indegree > 0) ? ghost_node_receive_ranks_vec.data() : MPI_UNWEIGHTED; + + // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) + //int* node_sourceweights = MPI_UNWEIGHTED; + + // ---------- Prepare OUTGOING edges (destinations) ---------- + // outdegree: Number of ranks to which this rank will SEND data + // These are the ranks that ghost nodes owned by this rank + int node_outdegree = static_cast(ghost_node_send_ranks.size()); + int* node_destinations = (node_outdegree > 0) ? 
ghost_node_send_ranks_vec.data() : MPI_UNWEIGHTED; + + // destinationweights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) + // int* node_destinationweights = MPI_UNWEIGHTED; + + // Initialize the graph communicator for node communication + node_communication_plan.initialize_graph_communicator(node_outdegree, node_destinations, node_indegree, node_sources); + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout<<"After node graph communicator"< elem_sendcounts(element_communication_plan.num_send_ranks, 0); + // std::vector elem_sdispls(element_communication_plan.num_send_ranks, 0); + + // Count how many boundary elements go to each destination rank + // boundary_elem_targets[elem_lid] contains pairs (dest_rank, elem_gid) for each boundary element + std::map> elems_to_send_by_rank; // rank -> list of boundary element local IDs + + for (int elem_lid = 0; elem_lid < input_mesh.num_elems; elem_lid++) { + if (!boundary_elem_targets[elem_lid].empty()) { + for (const auto &pr : boundary_elem_targets[elem_lid]) { + int dest_rank = pr.first; + elems_to_send_by_rank[dest_rank].push_back(elem_lid); + } + } + } + + // Serialize into a DRaggedRightArrayKokkos + DCArrayKokkos strides_array(element_communication_plan.num_send_ranks, "strides_for_elems_to_send"); + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); + strides_array.host(i) = elems_to_send_by_rank[dest_rank].size(); + } + strides_array.update_device(); + DRaggedRightArrayKokkos elems_to_send_by_rank_rr(strides_array, "elems_to_send_by_rank"); + + // Fill in the data + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); + for (int j = 0; j < elems_to_send_by_rank[dest_rank].size(); j++) { + elems_to_send_by_rank_rr.host(i, j) = elems_to_send_by_rank[dest_rank][j]; + } + } + elems_to_send_by_rank_rr.update_device(); + + + // 
Count how many ghost elements come from each source rank + // ghost_elem_owner_ranks[i] tells us which rank owns the i-th ghost element + std::map> elems_to_recv_by_rank; // rank -> list of ghost element indices + + for (size_t i = 0; i < ghost_elem_owner_ranks.size(); i++) { + int source_rank = ghost_elem_owner_ranks[i]; + int ghost_elem_local_id = output_mesh.num_owned_elems + i; + elems_to_recv_by_rank[source_rank].push_back(ghost_elem_local_id); + } + + // ========== Serialize into a DRaggedRightArrayKokkos ========== + DCArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks, "elem_recv_strides_array"); + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int source_rank = element_communication_plan.recv_rank_ids.host(i); + elem_recv_strides_array.host(i) = elems_to_recv_by_rank[source_rank].size(); + + } + elem_recv_strides_array.update_device(); + DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); + // Fill in the data + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int source_rank = element_communication_plan.recv_rank_ids.host(i); + for (int j = 0; j < elems_to_recv_by_rank[source_rank].size(); j++) { + elems_to_recv_by_rank_rr.host(i, j) = elems_to_recv_by_rank[source_rank][j]; + } + } + elems_to_recv_by_rank_rr.update_device(); + MATAR_FENCE(); + element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); + + MPI_Barrier(MPI_COMM_WORLD); + + // -------------------------------------------------------------------------------------- + // Build the send pattern for nodes + // -------------------------------------------------------------------------------------- + // Build reverse map via global IDs: for each local node gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost node GIDs. + // 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. 
+ // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. + // -------------------------------------------------------------------------------------- + + // Serialize into a DRaggedRightArrayKokkos + DCArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks,"node_send_strides_array"); + for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { + int dest_rank = node_communication_plan.send_rank_ids.host(i); + node_send_strides_array.host(i) = nodes_to_send_by_rank[dest_rank].size(); + } + node_send_strides_array.update_device(); + DRaggedRightArrayKokkos nodes_to_send_by_rank_rr(node_send_strides_array, "nodes_to_send_by_rank"); + + // Fill in the data + for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { + int dest_rank = node_communication_plan.send_rank_ids.host(i); + for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { + int node_gid = nodes_to_send_by_rank[dest_rank][j]; + int node_lid = node_gid_to_extended_lid[node_gid]; + nodes_to_send_by_rank_rr.host(i, j) = node_lid; + } + } + nodes_to_send_by_rank_rr.update_device(); + + // For each ghost element, determine which nodes need to be received from the owning rank + // Build the receive list based on ghost element nodes, not on ghost_node_gids + // This ensures we receive all nodes needed by ghost elements + std::map> node_set_to_recv_by_rank; // rank -> set of node GIDs to receive + + for (int i = 0; i < output_mesh.num_ghost_elems; i++) { + int ghost_elem_lid = output_mesh.num_owned_elems + i; + size_t ghost_elem_gid = output_mesh.local_to_global_elem_mapping.host(ghost_elem_lid); + int owning_rank = elem_gid_to_rank.at(ghost_elem_gid); + + // Collect all nodes in this ghost element + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = output_mesh.nodes_in_elem.host(ghost_elem_lid, j); + size_t node_gid = output_mesh.local_to_global_node_mapping.host(node_lid); + + // Only receive nodes that: + // 1. 
We don't own (not in local_node_gids) + // 2. Are NOT shared (not on MPI rank boundary) + // Shared nodes are already known to both ranks via element connectivity + if (local_node_gids.find(node_gid) == local_node_gids.end() && + shared_nodes_on_ranks[owning_rank].find(node_gid) == shared_nodes_on_ranks[owning_rank].end()) { + node_set_to_recv_by_rank[owning_rank].insert(node_gid); + } + } + } + + // Convert node GIDs to local indices and build nodes_to_recv_by_rank + std::map> nodes_to_recv_by_rank; // rank -> list of ghost node local indices + std::map node_gid_to_ghost_lid; // map ghost node GID to its local index in output_mesh + + // Build the GID->local index mapping for ALL ghost nodes in output_mesh + // Ghost nodes are those with local IDs >= num_owned_nodes + for (int i = output_mesh.num_owned_nodes; i < output_mesh.num_nodes; i++) { + size_t node_gid = output_mesh.local_to_global_node_mapping.host(i); + node_gid_to_ghost_lid[node_gid] = i; + } + + // Now convert the GID sets to local index vectors + for (const auto& pair : node_set_to_recv_by_rank) { + int source_rank = pair.first; + const std::set& node_gids = pair.second; + + for (size_t node_gid : node_gids) { + auto it = node_gid_to_ghost_lid.find(node_gid); + if (it != node_gid_to_ghost_lid.end()) { + nodes_to_recv_by_rank[source_rank].push_back(it->second); + } + } + } + + // Serialize into a DRaggedRightArrayKokkos + DCArrayKokkos nodes_recv_strides_array(node_communication_plan.num_recv_ranks, "nodes_recv_strides_array"); + for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { + int source_rank = node_communication_plan.recv_rank_ids.host(i); + nodes_recv_strides_array.host(i) = nodes_to_recv_by_rank[source_rank].size(); + } + nodes_recv_strides_array.update_device(); + DRaggedRightArrayKokkos nodes_to_recv_by_rank_rr(nodes_recv_strides_array, "nodes_to_recv_by_rank"); + // Fill in the data + for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { + int source_rank = 
node_communication_plan.recv_rank_ids.host(i);
+        for (int j = 0; j < nodes_to_recv_by_rank[source_rank].size(); j++) {
+            size_t node_gid = nodes_to_recv_by_rank[source_rank][j];
+            size_t local_id = node_gid_to_extended_lid[node_gid];
+
+            nodes_to_recv_by_rank_rr.host(i, j) = nodes_to_recv_by_rank[source_rank][j];
+        }
+    }
+    nodes_to_recv_by_rank_rr.update_device();
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    node_communication_plan.setup_send_recv(nodes_to_send_by_rank_rr, nodes_to_recv_by_rank_rr);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    // node_communication_plan.verify_send_recv();
+
+}
+
+
+/**
+ * @brief Partitions the input mesh using PT-Scotch and constructs the final distributed mesh.
+ *
+ * This function performs parallel mesh partitioning using a two-stage approach:
+ * 1. A naive partition is first constructed (simple assignment of mesh elements/nodes across ranks).
+ * 2. PT-Scotch is then used to repartition the mesh for load balancing and improved connectivity.
+ *
+ * The partitioned mesh, nodal data, and associated connectivity/gauss point information
+ * are distributed among MPI ranks as a result. The procedure ensures that each rank receives
+ * its assigned portion of the mesh and associated data in the final (target) decomposition.
+ *
+ * @param[in]  initial_mesh The input (global) mesh, present on rank 0 or all ranks at start.
+ * @param[out] final_mesh   The mesh assigned to this rank after PT-Scotch decomposition.
+ * @param[in]  initial_node Nodal data for the input (global) mesh; must match initial_mesh.
+ * @param[out] final_node   Nodal data for this rank after decomposition (corresponds to final_mesh).
+ * @param[out] gauss_point  Gauss point data structure, filled out for this rank's mesh.
+ * @param[in]  world_size   Number of MPI ranks in use (the total number of partitions).
+ * @param[in]  rank         This process's MPI rank ID.
+ *
+ * Internals:
+ * - The routine uses a naive_partition_mesh() helper to create an initial contiguous mesh partition.
+ * - It then uses PT-Scotch distributed graph routines to compute an improved partition and create the final mesh layout. + * - Both element-to-element and node-to-element connectivity, as well as mapping and ghosting information, + * are managed and exchanged across ranks. + * - MPI routines synchronize and exchange the relevant mesh and nodal data following the computed partition. + */ + +void partition_mesh( + Mesh_t& initial_mesh, + Mesh_t& final_mesh, + node_t& initial_node, + node_t& final_node, + GaussPoint_t& gauss_point, + int world_size, + int rank){ + + bool print_info = false; + // bool print_vtk = false; + + int num_dim = initial_mesh.num_dims; + + // Create mesh, gauss points, and node data structures on each rank + // This is the initial partitioned mesh + Mesh_t naive_mesh; + node_t naive_node; + + // Mesh partitioned by pt-scotch, not including ghost + Mesh_t intermediate_mesh; + node_t intermediate_node; + + // Helper arrays to hold element-element connectivity for naive partitioning that include what would be ghost, without having to build the full mesh + CArrayDual elems_in_elem_on_rank; + CArrayDual num_elems_in_elem_per_rank; + + + // Perform the naive partitioning of the mesh + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout << "Performing the naive partitioning of the mesh" << std::endl; + naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout << "Begin repartitioning using PT-Scotch" << std::endl; + + /********************************************************************************** + * Build PT-Scotch distributed graph representation of the mesh for repartitioning * + ********************************************************************************** + * + * This section constructs the distributed graph (SCOTCH_Dgraph) needed by PT-Scotch + * for mesh repartitioning. 
In this graph, each mesh element is a vertex, and edges
+ * correspond to mesh-neighbor relationships (i.e., elements that share a face or are
+ * otherwise neighbors per your mesh definition).
+ *
+ * We use the compact CSR (Compressed Sparse Row) representation, passing only the
+ * essential information required by PT-Scotch.
+ *
+ * Variables and structures used:
+ * - SCOTCH_Dgraph dgraph:
+ *     The distributed graph instance managed by PT-Scotch. Each MPI rank creates
+ *     and fills in its portion of the global graph.
+ *
+ * - const SCOTCH_Num baseval:
+ *     The base value for vertex and edge numbering. Set to 0 for C-style zero-based
+ *     arrays. Always use 0 unless you are using Fortran style 1-based arrays.
+ *
+ * - const SCOTCH_Num vertlocnbr:
+ *     The *number of local vertices* (mesh elements) defined on this MPI rank.
+ *     In our mesh, this is mesh.num_elems. PT-Scotch expects each rank to specify
+ *     its own local vertex count.
+ *
+ * - const SCOTCH_Num vertlocmax:
+ *     The *maximum number of local vertices* that could be stored (capacity). We
+ *     allocate with no unused holes, so vertlocmax = vertlocnbr.
+ *
+ * - std::vector<SCOTCH_Num> vertloctab:
+ *     CSR array [size vertlocnbr+1]: for each local vertex i, vertloctab[i]
+ *     gives the index in edgeloctab where the neighbor list of vertex i begins.
+ *     PT-Scotch expects this array to be of size vertlocnbr+1, where the difference
+ *     vertloctab[i+1] - vertloctab[i] gives the number of edges for vertex i.
+ *
+ * - std::vector<SCOTCH_Num> edgeloctab:
+ *     CSR array [variable size]: a flattened list of *neighboring element global IDs*,
+ *     in no particular order. For vertex i, its neighbors are located at
+ *     edgeloctab[vertloctab[i]...vertloctab[i+1]-1].
+ *     In this compact CSR, these are global IDs (GIDs), enabling PT-Scotch to
+ *     recognize edges both within and across ranks.
+ * + * - std::map elem_gid_to_offset: + * Helper map: For a given element global ID, gives the starting offset in + * the flattened neighbor array (elems_in_elem_on_rank) where this element's + * list of neighbor GIDs begins. This allows efficient neighbor list lookup. + * + * - (other arrays used, from mesh setup and communication phase) + * - elements_on_rank: vector of global element IDs owned by this rank. + * - num_elements_on_rank: number of owned elements. + * - num_elems_in_elem_per_rank: array, for each owned element, how many + * neighbors it has. + * - elems_in_elem_on_rank: flattened array of global neighbor IDs for all local elements. + * + **********************************************************************************/ + + // --- Step 1: Initialize the PT-Scotch distributed graph object on this MPI rank --- + SCOTCH_Dgraph dgraph; + if (SCOTCH_dgraphInit(&dgraph, MPI_COMM_WORLD) != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphInit failed\n"; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + // Set base value for numbering (0 for C-style arrays) + const SCOTCH_Num baseval = 0; + + // vertlocnbr: Number of elements (vertices) that are local to this MPI rank + const SCOTCH_Num vertlocnbr = static_cast(naive_mesh.num_elems); + + // vertlocmax: Maximum possible local vertices (no holes, so identical to vertlocnbr) + const SCOTCH_Num vertlocmax = vertlocnbr; + + // --- Step 2: Build compact CSR arrays for PT-Scotch (vertloctab, edgeloctab) --- + // vertloctab: for each local mesh element [vertex], gives index in edgeloctab where its neighbor list begins + std::vector vertloctab(vertlocnbr + 1); + + // edgeloctab: flat array of neighbor global IDs for all local elements, built in order + std::vector edgeloctab; + // edgeloctab holds the flattened list of all neighbors (edges) for all local elements, + // in a compact CSR (Compressed Sparse Row) format expected by PT-Scotch. Each entry is a global element ID + // of a neighbor. 
The edgeloctab array is built incrementally with one entry per element neighbor edge, + // so we reserve its capacity up front for efficiency. + // + // Heuristic: For unstructured 3D hexahedral meshes, a single element can have significantly more neighbors + // than in 2D cases. In a fully structured 3D grid, each hexahedral element can have up to 26 neighbors + // (since it may touch all surrounding elements along all axes). In unstructured grids, it's possible for some + // elements to have even more neighbors due to mesh irregularities and refinements. + // + // For most practical unstructured hexahedral meshes, values in the low 20s are common, but extreme cases + // (e.g., high-order connectivity, pathological splits, or meshes with "hanging nodes") may see higher counts. + // Using vertlocnbr * 26 as an upper limit is a reasonable estimate for fully connected (structured) cases, + // but consider increasing this if working with highly unstructured or pathological meshes. For safety and + // to avoid repeated reallocations during construction, we use 26 here as a conservative guess. + edgeloctab.reserve(vertlocnbr * 26); + + // Construct a map from element GID to its offset into elems_in_elem_on_rank (the array of neighbor GIDs) + // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. 
+ std::map elem_gid_to_offset; + size_t current_offset = 0; + for (size_t k = 0; k < naive_mesh.num_elems; k++) { + int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); + elem_gid_to_offset[elem_gid_on_rank] = current_offset; + current_offset += num_elems_in_elem_per_rank.host(k); + } + + // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- + SCOTCH_Num offset = 0; // running count of edges encountered + + for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) { + + // Record current edge offset for vertex lid in vertloctab + vertloctab[lid] = offset; + + // Obtain this local element's global ID (from mapping) + int elem_gid = naive_mesh.local_to_global_elem_mapping.host(lid); + + // Find offset in the flattened neighbor array for this element's neighbor list + size_t elems_in_elem_offset = elem_gid_to_offset[elem_gid]; + + // For this element, find the count of its neighbors + // This requires finding its index in the elements_on_rank array + size_t idx = 0; + for (size_t k = 0; k < naive_mesh.num_elems; k++) { + int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); + if (elem_gid_on_rank == elem_gid) { + idx = k; + break; + } + } + size_t num_nbrs = num_elems_in_elem_per_rank.host(idx); + + // Append each neighbor (by its GLOBAL elem GID) to edgeloctab + for (size_t j = 0; j < num_nbrs; j++) { + size_t neighbor_gid = elems_in_elem_on_rank.host(elems_in_elem_offset + j); // This is a global element ID! 
+ edgeloctab.push_back(static_cast(neighbor_gid)); + ++offset; // Increment running edge count + } + } + + // vertloctab[vertlocnbr] stores total number of edges written, finalizes the CSR structure + vertloctab[vertlocnbr] = offset; + + // edgelocnbr/edgelocsiz: Number of edge endpoints defined locally + // (PT-Scotch's distributed graphs allow edges to be replicated or owned by either endpoint) + const SCOTCH_Num edgelocnbr = offset; // total number of edge endpoints (sum of all local neighbor degrees) + const SCOTCH_Num edgelocsiz = edgelocnbr; // allocated size matches number of endpoints + + // Optionally print graph structure for debugging/validation + if (print_info) { + std::cout << "Rank " << rank << ": vertlocnbr = # of local elements(vertices) = " << vertlocnbr + << ", edgelocnbr = # of local edge endpoints = " << edgelocnbr << std::endl; + std::cout << "vertloctab (CSR row offsets): "; + for (size_t i = 0; i <= vertlocnbr; i++) { + std::cout << vertloctab[i] << " "; + } + std::cout << std::endl; + std::cout << "edgeloctab (first 20 neighbor GIDs): "; + for (size_t i = 0; i < std::min((size_t)20, edgeloctab.size()); i++) { + std::cout << edgeloctab[i] << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + /************************************************************************** + * Step 4: Build the distributed graph using PT-Scotch's SCOTCH_dgraphBuild + * + * - PT-Scotch will use our CSR arrays. Since we use compact representation, + * most optional arrays ("veloloctab", "vlblloctab", "edgegsttab", "edloloctab") + * can be passed as nullptr. + * - edgeloctab contains *GLOBAL element GIDs* of neighbors. PT-Scotch uses this + * to discover connections across processor boundaries, so you do not have to + * encode ownership or partition information yourself. 
+ **************************************************************************/ + int rc = SCOTCH_dgraphBuild( + &dgraph, + baseval, // start index (0) + vertlocnbr, // local vertex count (local elements) + vertlocmax, // local vertex max (no holes) + vertloctab.data(), // row offsets in edgeloctab + /*vendloctab*/ nullptr, // end of row offsets (compact CSR => nullptr) + /*veloloctab*/ nullptr, // vertex weights, not used + /*vlblloctab*/ nullptr, // vertex global labels (we use GIDs in edgeloctab) + edgelocnbr, // local edge endpoints count + edgelocsiz, // size of edge array + edgeloctab.data(), // global neighbor IDs for each local node + /*edgegsttab*/ nullptr, // ghost edge array, not used + /*edloloctab*/ nullptr // edge weights, not used + ); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphBuild failed rc=" << rc << "\n"; + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); + } + + // Optionally, print rank summary after graph build for further validation + if (print_info) { + SCOTCH_Num vertlocnbr_out; + SCOTCH_dgraphSize(&dgraph, &vertlocnbr_out, nullptr, nullptr, nullptr); + std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr = " << vertlocnbr_out << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished building the distributed graph using PT-Scotch"<(world_size)); + + // ===================== PT-Scotch Strategy Selection and Documentation ====================== + // The PT-Scotch "strategy" (stratdat here) controls the algorithms and heuristics used for partitioning. + // You can specify a string or build a strategy using functions that adjust speed, quality, and recursion. + // + // Common strategy flags (see "scotch.h", "ptscotch.h", and PT-Scotch documentation): + // + // - SCOTCH_STRATDEFAULT: Use the default (fast, reasonable quality) partitioning strategy. + // Useful for quick, generic partitions where quality is not critical. 
+ // + // - SCOTCH_STRATSPEED: Aggressively maximizes speed (at the cost of cut quality). + // For large runs or test runs where speed is more important than minimizing edgecut. + // + // - SCOTCH_STRATQUALITY: Prioritizes partition *quality* (minimizing edge cuts, maximizing load balance). + // Slower than the default. Use when high-quality partitioning is desired. + // + // - SCOTCH_STRATBALANCE: Tradeoff between speed and quality for balanced workload across partitions. + // Use if load balance is more critical than cut size. + // + // Additional Options: + // - Strategy can also be specified as a string (see Scotch manual, e.g., "b{sep=m{...} ...}"). + // - Recursion count parameter (here, set to 0) controls strategy recursion depth (0 = automatic). + // - Imbalance ratio (here, 0.01) allows minor imbalance in part weight for better cut quality. + // + // Example usage: + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATQUALITY, nparts, 0, 0.01); + // ^ quality-focused, nparts=number of parts/ranks + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATSPEED, nparts, 0, 0.05); + // ^ speed-focused, allow 5% imbalance + // + // Reference: + // - https://gitlab.inria.fr/scotch/scotch/-/blob/master/doc/libptscotch.pdf + // - SCOTCH_stratDgraphMapBuild() and related "strategy" documentation. + // + // --------------- Set up the desired partitioning strategy here: --------------- + SCOTCH_Strat stratdat; // PT-Scotch strategy object: holds partitioning options/settings + SCOTCH_stratInit(&stratdat); + + // Select partitioning strategy for this run: + // Use SCOTCH_STRATQUALITY for best cut quality. + // To change: replace with SCOTCH_STRATDEFAULT, SCOTCH_STRATSPEED, or SCOTCH_STRATBALANCE as discussed above. 
+ // Arguments: (strategy object, strategy flag, #parts, recursion (0=auto), imbalance ratio) + SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.001); + + // partloctab: output array mapping each local element (vertex) to a *target partition number* + // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. + std::vector partloctab(vertlocnbr); + rc = SCOTCH_dgraphMap(&dgraph, &archdat, &stratdat, partloctab.data()); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphMap failed rc=" << rc << "\n"; + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); + } + + // Clean up PT-Scotch strategy and architecture objects + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + + // Free the graph now that we have the partition assignments + SCOTCH_dgraphFree(&dgraph); + + /*************************************************************************** + * Step 7 (Optional): Print out the partitioning assignment per element + * - Each local element's local index lid and global ID (gid) are listed with the + * part to which PT-Scotch has assigned them. 
+ ***************************************************************************/ + print_info = false; + for(int rank_id = 0; rank_id < world_size; rank_id++) { + if(rank_id == rank && print_info) { + for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) { + size_t gid = naive_mesh.local_to_global_elem_mapping.host(lid); + std::cout << "[rank " << rank_id << "] elem_local=" << lid << " gid=" << gid + << " -> part=" << partloctab[lid] << "\n"; + } + MPI_Barrier(MPI_COMM_WORLD); + } + MPI_Barrier(MPI_COMM_WORLD); + } + print_info = false; + +// ****************************************************************************************** +// Build the intermediate mesh (without ghost nodes and elements) from the repartition +// ****************************************************************************************** + + + + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout << "\n=== Starting Mesh Redistribution Phase ===\n"; + MPI_Barrier(MPI_COMM_WORLD); + + // -------------- Phase 1: Determine elements to send to each rank -------------- + std::vector> elems_to_send(world_size); + for (int lid = 0; lid < naive_mesh.num_elems; lid++) { + int dest = static_cast(partloctab[lid]); + int elem_gid = static_cast(naive_mesh.local_to_global_elem_mapping.host(lid)); + elems_to_send[dest].push_back(elem_gid); + } + + // -------------- Phase 2: Exchange element GIDs -------------- + std::vector sendcounts(world_size), recvcounts(world_size); + for (int r = 0; r < world_size; r++) + sendcounts[r] = static_cast(elems_to_send[r].size()); + + MPI_Alltoall(sendcounts.data(), 1, MPI_INT, recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // Compute displacements + std::vector sdispls(world_size), rdispls(world_size); + int send_total = 0, recv_total = 0; + for (int r = 0; r < world_size; r++) { + sdispls[r] = send_total; + rdispls[r] = recv_total; + send_total += sendcounts[r]; + recv_total += recvcounts[r]; + } + + + // Flatten send buffer + // 
send_elems: flattened list of element global IDs (GIDs) that this rank is sending to all other ranks. + // For each rank r, elems_to_send[r] contains the element GIDs that should be owned by rank r after repartitioning. + std::vector send_elems; + send_elems.reserve(send_total); + for (int r = 0; r < world_size; r++) + send_elems.insert(send_elems.end(), elems_to_send[r].begin(), elems_to_send[r].end()); + + // new_elem_gids: receives the list of new element global IDs this rank will own after the exchange. + // It is filled after MPI_Alltoallv completes, and contains the GIDs for the elements new to (or remained on) this rank. + std::vector new_elem_gids(recv_total); + MPI_Alltoallv(send_elems.data(), sendcounts.data(), sdispls.data(), MPI_INT, + new_elem_gids.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // New elements owned by this rank + int num_new_elems = static_cast(new_elem_gids.size()); + + // -------------- Phase 3: Send element–node connectivity -------------- + int nodes_per_elem = naive_mesh.num_nodes_in_elem; + + // Flatten element-node connectivity by global node IDs + std::vector conn_sendbuf; + for (int r = 0; r < world_size; r++) { + for (int elem_gid : elems_to_send[r]) { + // find local element lid from elem_gid + int lid = -1; + for (int i = 0; i < naive_mesh.num_elems; i++) + if (naive_mesh.local_to_global_elem_mapping.host(i) == elem_gid) { lid = i; break; } + + for (int j = 0; j < nodes_per_elem; j++) { + int node_lid = naive_mesh.nodes_in_elem.host(lid, j); + int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); + conn_sendbuf.push_back(node_gid); + } + } + } + + // element-node connectivity counts (ints per dest rank) + std::vector conn_sendcounts(world_size), conn_recvcounts(world_size); + for (int r = 0; r < world_size; r++) + conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; + + MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, 
MPI_INT, MPI_COMM_WORLD); + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity counts"< conn_sdispls(world_size), conn_rdispls(world_size); + int conn_send_total = 0, conn_recv_total = 0; + for (int r = 0; r < world_size; r++) { + conn_sdispls[r] = conn_send_total; + conn_rdispls[r] = conn_recv_total; + conn_send_total += conn_sendcounts[r]; + conn_recv_total += conn_recvcounts[r]; + } + + std::vector conn_recvbuf(conn_recv_total); + MPI_Alltoallv(conn_sendbuf.data(), conn_sendcounts.data(), conn_sdispls.data(), MPI_INT, + conn_recvbuf.data(), conn_recvcounts.data(), conn_rdispls.data(), MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity"< node_gid_set(conn_recvbuf.begin(), conn_recvbuf.end()); + std::vector new_node_gids(node_gid_set.begin(), node_gid_set.end()); + int num_new_nodes = static_cast(new_node_gids.size()); + + // Build map gid→lid + std::unordered_map node_gid_to_lid; + for (int i = 0; i < num_new_nodes; i++) + node_gid_to_lid[new_node_gids[i]] = i; + + if (print_info) + std::cout << "[rank " << rank << "] owns " << num_new_nodes << " unique nodes\n"; + + + // -------------- Phase 5: Request node coordinates -------------- + std::vector node_coords_sendbuf; + for (int r = 0; r < world_size; r++) { + for (int gid : elems_to_send[r]) { + int lid = -1; + for (int i = 0; i < naive_mesh.num_elems; i++) + if (naive_mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + + for (int j = 0; j < nodes_per_elem; j++) { + int node_lid = naive_mesh.nodes_in_elem.host(lid, j); + int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); + + for(int dim = 0; dim < num_dim; dim++) { + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, dim)); + } + } + } + } + + // Each node is 3 doubles; same sendcounts scaling applies + std::vector coord_sendcounts(world_size), 
coord_recvcounts(world_size); + for (int r = 0; r < world_size; r++) + coord_sendcounts[r] = sendcounts[r] * nodes_per_elem * 3; + + MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates counts"< coord_sdispls(world_size), coord_rdispls(world_size); + int coord_send_total = 0, coord_recv_total = 0; + for (int r = 0; r < world_size; r++) { + coord_sdispls[r] = coord_send_total; + coord_rdispls[r] = coord_recv_total; + coord_send_total += coord_sendcounts[r]; + coord_recv_total += coord_recvcounts[r]; + } + + std::vector coord_recvbuf(coord_recv_total); + MPI_Alltoallv(node_coords_sendbuf.data(), coord_sendcounts.data(), coord_sdispls.data(), MPI_DOUBLE, + coord_recvbuf.data(), coord_recvcounts.data(), coord_rdispls.data(), MPI_DOUBLE, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates"<(num_new_nodes, "intermediate_mesh.local_to_global_node_mapping"); + intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems, "intermediate_mesh.local_to_global_elem_mapping"); + + // Fill global mappings + for (int i = 0; i < num_new_nodes; i++) + intermediate_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; + for (int i = 0; i < num_new_elems; i++) + intermediate_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; + + intermediate_mesh.local_to_global_node_mapping.update_device(); + intermediate_mesh.local_to_global_elem_mapping.update_device(); + + // rebuild the local element-node connectivity using the local node ids + for(int i = 0; i < intermediate_mesh.num_elems; i++) { + for(int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { + int node_gid = conn_recvbuf[i * intermediate_mesh.num_nodes_in_elem + j]; + + int node_lid = -1; + + // Binary search through local_to_global_node_mapping to find the equivalent local index + 
int left = 0, right = num_new_nodes - 1; + while (left <= right) { + int mid = left + (right - left) / 2; + size_t mid_gid = intermediate_mesh.local_to_global_node_mapping.host(mid); + if (node_gid == mid_gid) { + node_lid = mid; + break; + } else if (node_gid < mid_gid) { + right = mid - 1; + } else { + left = mid + 1; + } + } + intermediate_mesh.nodes_in_elem.host(i, j) = node_lid; + } + } + + intermediate_mesh.nodes_in_elem.update_device(); + + // Fill node coordinates + // coord_recvbuf contains coords in element-node order, but we need them in node order + // Build a map from node GID to coordinates + std::map> node_gid_to_coords; + int coord_idx = 0; + for (int e = 0; e < intermediate_mesh.num_elems; e++) { + for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { + int node_gid = conn_recvbuf[e * intermediate_mesh.num_nodes_in_elem + j]; + if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { + std::vector coords(num_dim); + for (int d = 0; d < num_dim; d++) { + coords[d] = coord_recvbuf[coord_idx * num_dim + d]; + } + node_gid_to_coords[node_gid] = coords; + } + coord_idx++; + } + } + + // Now fill coordinates in node order + intermediate_node.initialize(num_new_nodes, num_dim, {node_state::coords}); + for (int i = 0; i < num_new_nodes; i++) { + int node_gid = new_node_gids[i]; + auto it = node_gid_to_coords.find(node_gid); + if (it != node_gid_to_coords.end()) { + for (int d = 0; d < num_dim; d++) { + intermediate_node.coords.host(i, d) = it->second[d]; + } + } + } + intermediate_node.coords.update_device(); + + // Connectivity rebuild + intermediate_mesh.build_connectivity(); + MPI_Barrier(MPI_COMM_WORLD); + + CommunicationPlan element_communication_plan; + element_communication_plan.initialize(MPI_COMM_WORLD); + + CommunicationPlan node_communication_plan; + node_communication_plan.initialize(MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Starting the ghost element and node construction"< 
gauss_pt_states = {gauss_pt_state::fields, gauss_pt_state::fields_vec}; + + gauss_point.initialize(final_mesh.num_elems, final_mesh.num_dims, gauss_pt_states, element_communication_plan); // , &element_communication_plan + + // Initialize the gauss point fields on each rank + // Set owned elements to rank number, ghost elements to -1 (to verify communication) + for (int i = 0; i < final_mesh.num_owned_elems; i++) { + gauss_point.fields.host(i) = static_cast(rank); + gauss_point.fields_vec.host(i, 0) = static_cast(rank); + gauss_point.fields_vec.host(i, 1) = static_cast(rank); + gauss_point.fields_vec.host(i, 2) = static_cast(rank); + } + for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { + gauss_point.fields.host(i) = -1.0; // Ghost elements should be updated + gauss_point.fields_vec.host(i, 0) = -100.0; + gauss_point.fields_vec.host(i, 1) = -100.0; + gauss_point.fields_vec.host(i, 2) = -100.0; + } + gauss_point.fields.update_device(); + gauss_point.fields_vec.update_device(); + + MPI_Barrier(MPI_COMM_WORLD); + + gauss_point.fields.communicate(); + gauss_point.fields_vec.communicate(); + + MPI_Barrier(MPI_COMM_WORLD); + + CArrayKokkos tmp(final_mesh.num_elems); + + // Loop over all elements and average the values of elements connected to that element + FOR_ALL(i, 0, final_mesh.num_elems, { + double value = 0.0; + for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { + value += gauss_point.fields(final_mesh.elems_in_elem(i, j)); + } + value /= final_mesh.num_elems_in_elem(i); + + tmp(i) = value; + + + value = 0.0; + for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { + value += gauss_point.fields_vec(final_mesh.elems_in_elem(i, j), 0); + } + value /= final_mesh.num_elems_in_elem(i); + + gauss_point.fields_vec(i, 0) = value; + gauss_point.fields_vec(i, 1) = value; + gauss_point.fields_vec(i, 2) = value; + }); + MATAR_FENCE(); + + FOR_ALL(i, 0, final_mesh.num_elems, { + gauss_point.fields(i) = tmp(i); + }); + MATAR_FENCE(); + + 
gauss_point.fields.update_host(); + gauss_point.fields_vec.update_host(); + + + + // Test node communication using MPI_Neighbor_alltoallv + std::vector node_states = {node_state::coords, node_state::scalar_field, node_state::vector_field}; + final_node.initialize(final_mesh.num_nodes, 3, node_states, node_communication_plan); + + for (int i = 0; i < final_mesh.num_owned_nodes; i++) { + final_node.scalar_field.host(i) = static_cast(rank); + for(int dim = 0; dim < num_dim; dim++){ + final_node.vector_field.host(i, dim) = static_cast(rank); + } + } + for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { + final_node.scalar_field.host(i) = -100.0; + for(int dim = 0; dim < num_dim; dim++){ + final_node.vector_field.host(i, dim) = -100; + } + } + + final_node.coords.update_device(); + final_node.scalar_field.update_device(); + final_node.vector_field.update_device(); + MATAR_FENCE(); + MPI_Barrier(MPI_COMM_WORLD); + + node_communication_plan.verify_graph_communicator(); + + final_node.scalar_field.communicate(); + final_node.vector_field.communicate(); + + MATAR_FENCE(); + MPI_Barrier(MPI_COMM_WORLD); + + DCArrayKokkos tmp_too(final_mesh.num_nodes); + for(int smooth = 0; smooth < 3; smooth++){ + FOR_ALL(i, 0, final_mesh.num_nodes, { + + double value = final_node.scalar_field(i); + for(int j = 0; j < final_mesh.num_nodes_in_node(i); j++){ + value += final_node.scalar_field(final_mesh.nodes_in_node(i, j)); + } + value /= final_mesh.num_nodes_in_node(i) + 1; + tmp_too(i) = value; + }); + MATAR_FENCE(); + + FOR_ALL(i, 0, final_mesh.num_nodes, { + final_node.scalar_field(i) = tmp_too(i); + for(int dim = 0; dim < num_dim; dim++){ + final_node.vector_field(i, dim) = tmp_too(i); + } + }); + MATAR_FENCE(); + } + + final_node.scalar_field.update_host(); + + MATAR_FENCE(); + MPI_Barrier(MPI_COMM_WORLD); +} + +#endif // DECOMP_UTILS_H \ No newline at end of file diff --git a/examples/mesh_decomp/install_ptscotch.sh b/examples/mesh_decomp/install_ptscotch.sh new 
file mode 100755 index 00000000..29d3f853 --- /dev/null +++ b/examples/mesh_decomp/install_ptscotch.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Install script for Scotch and PT-Scotch +set -e + +# Configuration +LIB_DIR="lib" +# SCOTCH_VERSION="7.0.4" +# PTSCOTCH_VERSION="7.0.4" +# INSTALL_PREFIX="$(pwd)/${LIB_DIR}" + +# echo "Installing Scotch and PT-Scotch to ${INSTALL_PREFIX}" + +# Create lib directory if it doesn't exist +if [ ! -d "${LIB_DIR}" ]; then + mkdir -p "${LIB_DIR}" +fi +cd ${LIB_DIR} +# Clone and build Scotch +echo "Cloning Scotch..." +if [ -d "scotch" ]; then + rm -rf scotch +fi +git clone https://gitlab.inria.fr/scotch/scotch.git +cd scotch + +echo "Building Scotch..." +mkdir build +cd build +cmake .. -DCMAKE_BUILD_TYPE=Release \ + -DSCOTCH_MPI=ON \ + -DMPI_C_COMPILER=mpicc \ + -DMPI_Fortran_COMPILER=mpifort +make + +echo "Installation complete! Libraries installed in: ${INSTALL_PREFIX}" \ No newline at end of file diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h new file mode 100644 index 00000000..01ad00c6 --- /dev/null +++ b/examples/mesh_decomp/mesh.h @@ -0,0 +1,1502 @@ +/********************************************************************************************** +� 2020. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos +National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. +Department of Energy/National Nuclear Security Administration. All rights in the program are +reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear +Security Administration. The Government is granted for itself and others acting on its behalf a +nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare +derivative works, distribute copies to the public, perform publicly and display publicly, and +to permit others to do so. 
+This program is open source under the BSD-3 License. +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or other materials +provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used +to endorse or promote products derived from this software without specific prior +written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************************************/ +#ifndef MESH_H +#define MESH_H + +#include "matar.h" +#include "state.h" +#include + +#define PI 3.141592653589793 + +using namespace mtr; + +namespace mesh_init +{ +// element mesh types +enum elem_name_tag +{ + linear_simplex_element = 0, + linear_tensor_element = 1, + arbitrary_tensor_element = 2 +}; + +// other enums could go here on the mesh +} // end namespace + + +/* +========================== +Nodal indexing convention +========================== + + K + ^ J + | / + | / + | / + 6------------------7 + /| /| + / | / | + / | / | + / | / | + / | / | +4------------------5 | +| | | | ----> I +| | | | +| | | | +| | | | +| 2------------|-----3 +| / | / +| / | / +| / | / +| / | / +|/ |/ +0------------------1 + +nodes are ordered for outward normal +patch 0: [0,4,6,2] xi-minus dir +patch 1: [1,3,7,5] xi-plus dir +patch 2: [0,1,5,4] eta-minus dir +patch 3: [3,2,6,7] eta-plus dir +patch 4: [0,2,3,1] zeta-minus dir +patch 6: [4,5,7,6] zeta-plus dir +*/ + +// sort in ascending order using bubble sort +KOKKOS_INLINE_FUNCTION +void bubble_sort(size_t arr[], const size_t num) +{ + for (size_t i = 0; i < (num - 1); i++) { + for (size_t j = 0; j < (num - i - 1); j++) { + if (arr[j] > arr[j + 1]) { + size_t temp = arr[j]; + arr[j] = arr[j + 1]; + arr[j + 1] = temp; + } // end if + } // end for j + } // end for i +} // end function + +struct zones_in_elem_t +{ + private: + size_t num_zones_in_elem_; + public: + zones_in_elem_t() { + }; + + zones_in_elem_t(const size_t num_zones_in_elem_inp) { + this->num_zones_in_elem_ = num_zones_in_elem_inp; + }; + + // return global zone index for given local zone index in an element + size_t host(const size_t elem_gid, const size_t zone_lid) const + { + return elem_gid * num_zones_in_elem_ + zone_lid; + }; + + // Return the global zone ID given an element gloabl ID and a local zone ID + KOKKOS_INLINE_FUNCTION + size_t operator()(const size_t 
elem_gid, const size_t zone_lid) const + { + return elem_gid * num_zones_in_elem_ + zone_lid; + }; +}; + +// if material points are defined strictly internal to the element. +struct gauss_in_elem_t +{ + private: + size_t num_gauss_in_elem_; + public: + gauss_in_elem_t() { + }; + + gauss_in_elem_t(const size_t num_gauss_in_elem_inp) { + this->num_gauss_in_elem_ = num_gauss_in_elem_inp; + }; + + // return global gauss index for given local gauss index in an element + size_t host(const size_t elem_gid, const size_t leg_gauss_lid) const + { + return elem_gid * num_gauss_in_elem_ + leg_gauss_lid; + }; + + // Return the global gauss ID given an element gloabl ID and a local gauss ID + KOKKOS_INLINE_FUNCTION + size_t operator()(const size_t elem_gid, const size_t leg_gauss_lid) const + { + return elem_gid * num_gauss_in_elem_ + leg_gauss_lid; + }; +}; + +/// if material points are defined at element interfaces +struct lobatto_in_elem_t +{ + private: + size_t num_lobatto_in_elem_; + public: + lobatto_in_elem_t() { + }; + + lobatto_in_elem_t(const size_t num_lobatto_in_elem_inp) { + this->num_lobatto_in_elem_ = num_lobatto_in_elem_inp; + }; + + // return global gauss index for given local gauss index in an element + size_t host(const size_t elem_gid, const size_t lob_gauss_lid) const + { + return elem_gid * num_lobatto_in_elem_ + lob_gauss_lid; + }; + + // Return the global gauss ID given an element gloabl ID and a local gauss ID + KOKKOS_INLINE_FUNCTION + size_t operator()(const size_t elem_gid, const size_t lob_gauss_lid) const + { + return elem_gid * num_lobatto_in_elem_ + lob_gauss_lid; + }; +}; + +// struct nodes_in_zone_t { +// private: +// size_t num_nodes_in_zone_; +// public: +// nodes_in_zone_t(){}; + +// nodes_in_zone_t(const size_t num_nodes_in_zone_inp){ +// this->num_nodes_in_zone_ = num_nodes_in_zone_inp; +// }; + +// // return global zone index for given local zone index in an element +// size_t host(const size_t zone_gid, const size_t node_lid) const{ +// 
return zone_gid*num_nodes_in_zone_ + node_lid; +// }; + +// KOKKOS_INLINE_FUNCTION +// size_t operator()(const size_t zone_gid, const size_t node_lid) const{ +// return zone_gid*num_nodes_in_zone_ + node_lid; +// }; +// }; + +// mesh sizes and connectivity data structures +struct Mesh_t +{ + // ******* Entity Definitions **********// + // Element: A hexahedral volume + // Zone: A discretization of an element base on subdividing the element using the nodes + // Node: A kinematic degree of freedom + // Surface: The 2D surface of the element + // Patch: A discretization of a surface by subdividing the surface using the nodes + // Corner: A element-node pair + + bool verbose = false; + + // ---- Global Mesh Definitions ---- // + mesh_init::elem_name_tag elem_kind = mesh_init::linear_tensor_element; ///< The type of elements used in the mesh + + size_t Pn = 1; ///< Polynomial order of kinematic space + size_t num_dims = 3; ///< Number of spatial dimension + + // ---- Element Data Definitions ---- // + size_t num_elems; ///< Number of elements in the mesh + size_t num_nodes_in_elem; ///< Number of nodes in an element + size_t num_patches_in_elem; ///< Number of patches in an element + size_t num_surfs_in_elem; ///< Number of surfaces in an element + size_t num_zones_in_elem; ///< Number of zones in an element + + size_t num_gauss_in_elem; ///< Number of Gauss points in an element + size_t num_lobatto_in_elem; ///< Number of Gauss Lobatto points in an element + + DCArrayKokkos nodes_in_elem; ///< Nodes in an element + CArrayKokkos corners_in_elem; ///< Corners in an element -- this can just be a functor + + RaggedRightArrayKokkos elems_in_elem; ///< Elements connected to an element + CArrayKokkos num_elems_in_elem; ///< Number of elements connected to an element + + CArrayKokkos patches_in_elem; ///< Patches in an element (including internal patches) + CArrayKokkos surfs_in_elem; ///< Surfaces on an element + + // CArrayKokkos zones_in_elem; ///< Zones in an element + 
zones_in_elem_t zones_in_elem; ///< Zones in an element + lobatto_in_elem_t lobatto_in_elem; ///< Gauss Lobatto points in an element + gauss_in_elem_t gauss_in_elem; ///< Gauss points in an element + + // ---- Node Data Definitions ---- // + size_t num_nodes; ///< Number of nodes in the mesh + + RaggedRightArrayKokkos corners_in_node; ///< Corners connected to a node + CArrayKokkos num_corners_in_node; ///< Number of corners connected to a node + RaggedRightArrayKokkos elems_in_node; ///< Elements connected to a given node + RaggedRightArrayKokkos nodes_in_node; ///< Nodes connected to a node along an edge + CArrayKokkos num_nodes_in_node; ///< Number of nodes connected to a node along an edge + + // ---- Surface Data Definitions ---- // + size_t num_surfs; ///< Number of surfaces in the mesh + size_t num_nodes_in_surf; ///< Number of nodes in a surface + size_t num_patches_in_surf; ///< Number of patches in a surface + + CArrayKokkos patches_in_surf; ///< Patches in a surface + CArrayKokkos nodes_in_surf; ///< Nodes connected to a surface + CArrayKokkos elems_in_surf; ///< Elements connected to a surface + + // ---- Patch Data Definitions ---- // + size_t num_patches; ///< Number of patches in the mesh + size_t num_nodes_in_patch; ///< Number of nodes in a patch + // size_t num_lobatto_in_patch; ///< Number of Gauss Lobatto nodes in a patch + // size_t num_gauss_in_patch; ///< Number of Gauss nodes in a patch + + CArrayKokkos nodes_in_patch; ///< Nodes connected to a patch + CArrayKokkos elems_in_patch; ///< Elements connected to a patch + CArrayKokkos surf_in_patch; ///< Surfaces connected to a patch (co-planar) + + // ---- Corner Data Definitions ---- // + size_t num_corners; ///< Number of corners (define) in the mesh + + // ---- Zone Data Definitions ---- // + size_t num_zones; ///< Number of zones in the mesh + size_t num_nodes_in_zone; ///< Number of nodes in a zone + + CArrayKokkos nodes_in_zone; ///< Nodes defining a zone + // nodes_in_zone_t 
nodes_in_zone; + + // ---- Boundary Data Definitions ---- // + size_t num_bdy_sets; ///< Number of boundary sets + size_t num_bdy_nodes; ///< Number of boundary nodes + size_t num_bdy_patches; ///< Number of boundary patches + + CArrayKokkos bdy_patches; ///< Boundary patches + CArrayKokkos bdy_nodes; ///< Boundary nodes + + RaggedRightArrayKokkos bdy_patches_in_set; ///< Boundary patches in a boundary set + DCArrayKokkos num_bdy_patches_in_set; ///< Number of boundary nodes in a set + + RaggedRightArrayKokkos bdy_nodes_in_set; ///< Boundary nodes in a boundary set + DCArrayKokkos num_bdy_nodes_in_set; ///< Number of boundary nodes in a set + + + // MPI Decomposition Data Definitions ---- // + DCArrayKokkos local_to_global_node_mapping; ///< Local to global node mapping + DCArrayKokkos local_to_global_elem_mapping; ///< Local to global element mapping + + // Element communicaiton data definitions + size_t num_owned_elems; ///< Number of owned elements on this rank + size_t num_boundary_elems; ///< Number of boundary elements on this rank (send data to neighboring MPI ranks) + DCArrayKokkos boundary_elem_local_ids; ///< Local IDs of boundary elements on this rank (send data to neighboring MPI ranks) + size_t num_ghost_elems; ///< Number of ghost elements on this rank (receive data from neighboring MPI ranks) + + // Node communicaiton data definitions + size_t num_owned_nodes; ///< Number of owned nodes on this rank + size_t num_boundary_nodes; ///< Number of boundary nodes on this rank (send data to neighboring MPI ranks) + DCArrayKokkos boundary_node_local_ids; ///< Local IDs of boundary nodes on this rank (send data to neighboring MPI ranks) + size_t num_ghost_nodes; ///< Number of ghost nodes on this rank (receive data from neighboring MPI ranks) + + + + + + // initialization methods + void initialize_nodes(const size_t num_nodes_inp) + { + num_nodes = num_nodes_inp; + return; + }; // end method + + // initialization methods + void initialize_elems(const size_t 
num_elems_inp, const size_t num_dims_inp) + { + num_dims = num_dims_inp; + num_nodes_in_elem = 1; + + for (int dim = 0; dim < num_dims; dim++) { + num_nodes_in_elem *= 2; + } + num_elems = num_elems_inp; + nodes_in_elem = DCArrayKokkos(num_elems, num_nodes_in_elem, "mesh.nodes_in_elem"); + corners_in_elem = CArrayKokkos(num_elems, num_nodes_in_elem, "mesh.corners_in_elem"); + + // 1 Gauss point per element + num_gauss_in_elem = 1; + + // 1 zone per element + num_zones_in_elem = 1; + + gauss_in_elem = gauss_in_elem_t(num_gauss_in_elem); + + return; + }; // end method + + // initialization method + void initialize_elems_Pn(const size_t num_elems_inp, + const size_t num_nodes_in_elem_inp, + const size_t num_gauss_leg_in_elem_inp, + const size_t num_zones_in_elem_inp, + const size_t num_nodes_in_zone_inp, + const size_t num_surfs_in_elem_inp, + const size_t num_dims_inp) + { + num_dims = num_dims_inp; + num_elems = num_elems_inp; + + num_nodes_in_elem = num_nodes_in_elem_inp; + num_nodes_in_zone = num_nodes_in_zone_inp; + num_gauss_in_elem = num_gauss_leg_in_elem_inp; + num_zones_in_elem = num_zones_in_elem_inp; + num_surfs_in_elem = num_surfs_in_elem_inp; + + num_zones = num_zones_in_elem * num_elems; + + nodes_in_elem = DCArrayKokkos(num_elems, num_nodes_in_elem, "mesh.nodes_in_elem"); + corners_in_elem = CArrayKokkos(num_elems, num_nodes_in_elem, "mesh.corners_in_elem"); + zones_in_elem = zones_in_elem_t(num_zones_in_elem); + surfs_in_elem = CArrayKokkos(num_elems, num_surfs_in_elem, "mesh.surfs_in_zone"); + nodes_in_zone = CArrayKokkos(num_zones, num_nodes_in_zone, "mesh.nodes_in_zone"); + gauss_in_elem = gauss_in_elem_t(num_gauss_in_elem); + + return; + }; // end method + + // initialization methods + void initialize_corners(const size_t num_corners_inp) + { + num_corners = num_corners_inp; + + return; + }; // end method + + // build the corner mesh connectivity arrays + void build_corner_connectivity() + { + num_corners_in_node = CArrayKokkos(num_nodes, 
"mesh.num_corners_in_node"); // stride sizes + + // initializing the number of corners (node-cell pair) to be zero + FOR_ALL_CLASS(node_gid, 0, num_nodes, { + num_corners_in_node(node_gid) = 0; + }); + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(node_lid, 0, num_nodes_in_elem, { + // get the global_id of the node + size_t node_gid = nodes_in_elem(elem_gid, node_lid); + + // increment the number of corners attached to this point + num_corners_in_node(node_gid) = num_corners_in_node(node_gid) + 1; + }); // end FOR_ALL over nodes in element + } // end for elem_gid + + // the stride sizes are the num_corners_in_node at the node + corners_in_node = RaggedRightArrayKokkos(num_corners_in_node, "mesh.corners_in_node"); + + CArrayKokkos count_saved_corners_in_node(num_nodes, "count_saved_corners_in_node"); + + // reset num_corners to zero + FOR_ALL_CLASS(node_gid, 0, num_nodes, { + count_saved_corners_in_node(node_gid) = 0; + }); + + // the elems_in_elem data type + elems_in_node = RaggedRightArrayKokkos(num_corners_in_node, "mesh.elems_in_node"); + + // populate the elements connected to a node list and corners in a node + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(node_lid, 0, num_nodes_in_elem, { + // get the global_id of the node + size_t node_gid = nodes_in_elem(elem_gid, node_lid); + + // the column index is the num corners saved + size_t j = count_saved_corners_in_node(node_gid); + + // Save corner index to this node_gid + size_t corner_gid = node_lid + elem_gid * num_nodes_in_elem; // this can be a functor + corners_in_node(node_gid, j) = corner_gid; + + elems_in_node(node_gid, j) = elem_gid; // save the elem_gid + + // Save corner index to element + size_t corner_lid = node_lid; + corners_in_elem(elem_gid, corner_lid) = corner_gid; + + // increment the number of corners saved to this node_gid + count_saved_corners_in_node(node_gid) = count_saved_corners_in_node(node_gid) + 1; + }); // end 
FOR_ALL over nodes in element + } // end for elem_gid + + return; + } // end of build_corner_connectivity + + // build elem connectivity arrays + void build_elem_elem_connectivity() + { + // find the max number of elems around a node + size_t max_num_elems_in_node; + size_t max_num_lcl; + FOR_REDUCE_MAX_CLASS(node_gid, 0, num_nodes, max_num_lcl, { + // num_corners_in_node = num_elems_in_node + size_t max_num = num_corners_in_node(node_gid); + + if (max_num > max_num_lcl) { + max_num_lcl = max_num; + } + }, max_num_elems_in_node); // end parallel reduction on max + Kokkos::fence(); + + // a temporary ragged array to save the elems around an elem + DynamicRaggedRightArrayKokkos temp_elems_in_elem(num_nodes, num_nodes_in_elem * max_num_elems_in_node, "temp_elems_in_elem"); + + num_elems_in_elem = CArrayKokkos(num_elems, "mesh.num_elems_in_elem"); + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + num_elems_in_elem(elem_gid) = 0; + }); + Kokkos::fence(); + + // find and save neighboring elem_gids of an elem + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (int node_lid = 0; node_lid < num_nodes_in_elem; node_lid++) { + // get the gid for the node + size_t node_id = nodes_in_elem(elem_gid, node_lid); + + // loop over all elems connected to node_gid + for (int elem_lid = 0; elem_lid < num_corners_in_node(node_id); elem_lid++) { + // get the global id for the neighboring elem + size_t neighbor_elem_gid = elems_in_node(node_id, elem_lid); + + // a flag to save (=1) or not (=0) + size_t save = 1; + + // a true neighbor_elem_id is not equal to elem_gid + if (neighbor_elem_gid == elem_gid) { + save = 0; // don't save + } // end if + + // check to see if the neighbor_elem_gid has been saved already + size_t num_saved = temp_elems_in_elem.stride(elem_gid); + for (size_t i = 0; i < num_saved; i++) { + if (neighbor_elem_gid == temp_elems_in_elem(elem_gid, i)) { + save = 0; // don't save, it has been saved already + } // end if + } // end for i + + if (save == 1) { + // increment the 
number of neighboring elements saved + temp_elems_in_elem.stride(elem_gid)++; + + // save the neighboring elem_gid + temp_elems_in_elem(elem_gid, num_saved) = neighbor_elem_gid; + } // end if save + } // end for elem_lid in a node + } // end for node_lid in an elem + + // save the actial stride size + num_elems_in_elem(elem_gid) = temp_elems_in_elem.stride(elem_gid); + }); // end FOR_ALL elems + Kokkos::fence(); + + // compress out the extra space in the temp_elems_in_elem + elems_in_elem = RaggedRightArrayKokkos(num_elems_in_elem, "mesh.elems_in_elem"); + + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (size_t i = 0; i < num_elems_in_elem(elem_gid); i++) { + elems_in_elem(elem_gid, i) = temp_elems_in_elem(elem_gid, i); + } // end for i + }); // end FOR_ALL elems + Kokkos::fence(); + + return; + } // end of build_elem_elem_connectivity + + // build the patches + void build_patch_connectivity() + { + // WARNING WARNING + // the mesh element kind should be in the input file and set when reading mesh + // mesh_elem_kind = mesh_init::linear_tensor_element; // MUST BE SET + + // building patches + + num_nodes_in_patch = 2 * (num_dims - 1); // 2 (2D) or 4 (3D) + num_surfs_in_elem = 2 * num_dims; // 4 (2D) or 6 (3D) + + // num_lobatto_in_patch = int(pow(3, num_dims-1)); + + // num_gauss_in_patch = 2*(num_dims-1); + + size_t num_patches_in_surf; // = Pn_order or = Pn_order*Pn_order + + size_t num_1D = Pn + 1; // number of nodes in 1D + + // num quad points 1D // + // size_t num_lob_1D = 2*Pn + 1; + // size_t num_1D = 2*Pn; + + DCArrayKokkos node_ordering_in_elem; // dimensions will be (num_patches_in_elem, num_nodes_in_patch); + + // DCArrayKokkos lobatto_ordering_in_elem; // dimensions will be (num_patches_in_elem, num_lobatto_in_patch); + + // DCArrayKokkos gauss_ordering_in_elem; // dimensions will be (num_patches_in_elem, num_gauss_in_patch); + + if (verbose) printf("Number of dimensions = %zu \n", num_dims); + + if (num_dims == 3) { + // num_patches_in_surf = [1^2, 
2^2, 3^2, 4^2, ... , Pn^2] + + num_patches_in_surf = Pn * Pn; + + num_patches_in_elem = num_patches_in_surf * num_surfs_in_elem; + + // nodes in a patch in the element + node_ordering_in_elem = DCArrayKokkos(num_patches_in_elem, num_nodes_in_patch, "node_ordering_in_elem"); + + // lobatto_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_lobatto_in_patch); + + // gauss_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_gauss_in_patch); + + // printf("num_patches_in_elem = %zu \n", num_patches_in_elem); + // printf("num_nodes_in_patch = %zu \n", num_nodes_in_patch); + // printf("num_lobatto_in_patch = %zu \n", num_lobatto_in_patch); + // printf("num_gauss_in_patch = %zu \n", num_gauss_in_patch); + // printf("Number of surfaces = %zu \n", num_surfs_in_elem); + } + else { + num_patches_in_surf = Pn; + + num_patches_in_elem = num_patches_in_surf * num_surfs_in_elem; + + // nodes in a patch in the element + node_ordering_in_elem = DCArrayKokkos(num_patches_in_elem, num_nodes_in_patch, "node_ordering_in_elem"); + // lobatto_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_lobatto_in_patch); + // gauss_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_gauss_in_patch); + } // end if dim + + // On the CPU, set the node order for the patches in an element + // classic linear elements + if (elem_kind == mesh_init::linear_tensor_element) { + if (num_dims == 3) { + + size_t temp_node_lids[24] = { 0, 4, 6, 2, + 1, 3, 7, 5, + 0, 1, 5, 4, + 3, 2, 6, 7, + 0, 2, 3, 1, + 4, 5, 7, 6 }; + + int count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + // gauss_ordering_in_elem.host( elem_patch_lid, node_lid ) = temp_node_lids[count]; + count++; + } // end for node_lid + 
elem_patch_lid++; + } // end for patch_lid in a surface + } // end for i + + // count = 0; + // elem_patch_lid = 0; + // for ( size_t surf_lid=0; surf_lid < num_surfs_in_elem; surf_lid++ ){ + // for ( size_t patch_lid=0; patch_lid < num_patches_in_surf; patch_lid++ ){ + // for ( size_t lobatto_lid=0; lobatto_lid < num_lobatto_in_patch; lobatto_lid++ ){ + // lobatto_ordering_in_elem.host( elem_patch_lid, lobatto_lid ) = temp_node_lids[count]; + // count++; + // } // end for node_lid + // elem_patch_lid ++; + // } // end for patch_lid in a surface + // } // end for i + } + else { + // J + // | + // 3---2 + // | | -- I + // 0---1 + // + size_t temp_node_lids[8] = + { 0, 3, + 1, 2, + 0, 1, + 3, 2 }; + + int count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + // gauss_ordering_in_elem.host( elem_patch_lid, node_lid ) = temp_node_lids[count]; + count++; + } // end for node_lid + elem_patch_lid++; + } // end for patch_lid in a surface + } // end for i + } // end if on dims + } // end of linear element iwth classic numbering + // ----- + // arbitrary-order element + // ----- + else if (elem_kind == mesh_init::arbitrary_tensor_element) { + size_t temp_node_lids[num_nodes_in_patch * num_patches_in_surf * num_surfs_in_elem]; + + printf("arbitrary order tensor element \n"); + + // arbitrary-order node ordering in patches of an element + if (num_dims == 3) { + /* + + i,j,k layout + + k j + | / + |/ + o-->i + + + i=0,imax + o (j+1,k+1) + /| + (j,k+1) o o (j+1,k) + |/ + (j,k) o + + */ + + int count = 0; + + int i_patch, j_patch, k_patch; + + // i-minus-dir patches + + i_patch = 0; + for (int k = 0; k < num_1D - 1; k++) { + for (int j = 0; j < num_1D - 1; j++) { + // node_lid 0 in patch + // index 
= i + j*num_1D + k*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j, k, num_1D); + count++; + + // node_lid 1 in patch + // index = i + j*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j, k+1, num_1D); + count++; + + // node_lid 2 in patch + // index = i + (j+1)*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j+1, k+1, num_1D); + count++; + + // node_lid 3 in patch + // index = i + (j+1)*num_1D + k*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j+1, k, num_1D); + count++; + } // end for k + } // end for j + + // printf("i-minus\n"); + + // i-plus-dir patches + i_patch = num_1D - 1; + // printf("num_1D = %zu \n", num_1D); + // printf("i_patch = %d \n", i_patch); + printf("num_nodes_in_elem %zu \n", num_nodes_in_elem); + for (int k = 0; k < num_1D - 1; k++) { + for (int j = 0; j < num_1D - 1; j++) { + // node_lid 0 in patch + // index = i + j*num_1D + k*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j, k, num_1D); + count++; + + // node_lid 1 in patch + // index = i + (j+1)*num_1D + k*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j+1, k, num_1D); + count++; + + // node_lid 2 in patch + // index = i + (j+1)*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j+1, k+1, num_1D); + count++; + + // node_lid 3 in patch + // index = i + j*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j, k+1, num_1D); + count++; + } // end for j + } // end for k + + // printf("i-plus\n"); + + /* + + i,j,k layout + + k j + | / 
+ |/ + o-->i + + + j=0,jmax + + (i,,k+1) o--o (i+1,,k+1) + | | + (i,,k) o--o (i+1,,k) + + */ + + j_patch = 0; + for (int k = 0; k < num_1D - 1; k++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i, j_patch, k, num_1D); + count++; + + // node_lid 1 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i+1, j_patch, k, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i+1, j_patch, k+1, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i, j_patch, k+1, num_1D); + count++; + } // end for i + } // end for k + + // printf("j-minus\n"); + + j_patch = num_1D - 1; + for (int k = 0; k < num_1D - 1; k++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i, j_patch, k, num_1D); + count++; + + // node_lid 1 in patch + temp_node_lids[count] = i + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i, j_patch, k+1, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i+1, j_patch, k+1, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i+1, j_patch, k, num_1D); + count++; + } // end for i + } // end for k + + // printf("j-plus\n"); + + /* + + i,j,k layout + + k j + | / + |/ + o-->i + + + k=0,kmax + + (i,j+1) o--o (i+1,j+1) + / / + (i,j) o--o (i+1,j) + + */ + + k_patch = 0; + for (int j = 0; j < num_1D - 1; j++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j, k_patch, num_1D); + count++; + + 
// node_lid 1 in patch + temp_node_lids[count] = i + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j+1, k_patch, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j+1, k_patch, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + 1 + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j, k_patch, num_1D); + count++; + } // end for i + } // end for j + // printf("k-minus\n"); + + k_patch = num_1D - 1; + for (int j = 0; j < num_1D - 1; j++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j, k_patch, num_1D); + count++; + + // node_lid 1 in patch + temp_node_lids[count] = i + 1 + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j, k_patch, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j+1, k_patch, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j+1, k_patch, num_1D); + count++; + } // end for i + } // end for j + + // printf("k-plus\n"); + + count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < 6; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < 4; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + count++; + } // end for node_lid + elem_patch_lid++; + } // end for patch_lid in a surface + } // end for i + } // end if 3D + // + else{ + // 2D arbitrary order elements + int count = 0; + int i_patch, j_patch; + + // i-minus-dir patches + + i_patch = 0; + for (int j = 0; j < num_1D - 1; j++) { + temp_node_lids[count] = i_patch + j * num_1D; // node_rid(i_patch, j, num_1D; + count++; + + 
temp_node_lids[count] = i_patch + (j + 1) * num_1D; // node_rid(i_patch, j+1, num_1D; + count++; + } // end for j + + // i-plus-dir patches + i_patch = num_1D - 1; + for (int j = 0; j < num_1D - 1; j++) { + temp_node_lids[count] = i_patch + j * num_1D; // node_rid(i_patch, j, num_1D; + count++; + + temp_node_lids[count] = i_patch + (j + 1) * num_1D; // node_rid(i_patch, j+1, num_1D; + count++; + } // end for j + + j_patch = 0; + for (int i = 0; i < num_1D - 1; i++) { + temp_node_lids[count] = i + j_patch * num_1D; // node_rid(i, j_patch, num_1D); + count++; + + temp_node_lids[count] = i + 1 + j_patch * num_1D; // node_rid(i+1, j_patch, num_1D); + count++; + } // end for i + + j_patch = num_1D - 1; + for (int i = 0; i < num_1D - 1; i++) { + temp_node_lids[count] = i + j_patch * num_1D; // node_rid(i, j_patch, num_1D); + count++; + + temp_node_lids[count] = i + 1 + j_patch * num_1D; // node_rid(i+1, j_patch, num_1D); + count++; + } // end for i + + count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + count++; + } // end for node_lid + elem_patch_lid++; + } // end for patch_lid in a surface + } // end for i + } // end else on dim + + // build zones in high order element + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + size_t node_lids[8]; // temp storage for local node ids + for (int k = 0; k < num_1D - 1; k++) { + for (int j = 0; j < num_1D - 1; j++) { + for (int i = 0; i < num_1D - 1; i++) { + node_lids[0] = i + j * (num_1D) + k * (num_1D) * (num_1D); // i,j,k + node_lids[1] = i + 1 + j * (num_1D) + k * (num_1D) * (num_1D); // i+1, j, k + node_lids[2] = i + (j + 1) * (num_1D) + k * (num_1D) * (num_1D); // i,j+1,k + node_lids[3] = i + 1 + (j + 1) * (num_1D) + k * (num_1D) * (num_1D); // 
i+1, j+1, k + node_lids[4] = i + j * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i, j , k+1 + node_lids[5] = i + 1 + j * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i + 1, j , k+1 + node_lids[6] = i + (j + 1) * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i,j+1,k+1 + node_lids[7] = i + 1 + (j + 1) * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i+1, j+1, k+1 + + size_t zone_lid = i + j * (num_1D - 1) + k * (num_1D - 1) * (num_1D - 1); + size_t zone_gid = zones_in_elem(elem_gid, zone_lid); + + for (int node_lid = 0; node_lid < 8; node_lid++) { + // get global id for the node + size_t node_gid = nodes_in_elem(elem_gid, node_lids[node_lid]); + nodes_in_zone(zone_gid, node_lid) = node_gid; + } + } // i + } // j + } // k + }); // end FOR_ALL elem_gid + } // end if arbitrary-order element + else { + printf("\nERROR: mesh type is not known \n"); + } // end if + + // update the device + node_ordering_in_elem.update_device(); + Kokkos::fence(); + + if (verbose) printf("Built node ordering \n"); + + // for saving the hash keys of the patches and then the neighboring elem_gid + CArrayKokkos hash_keys_in_elem(num_elems, num_patches_in_elem, num_nodes_in_patch, "hash_keys_in_elem"); // always 4 ids in 3D + + // for saving the adjacent patch_lid, which is the slide_lid + // CArrayKokkos neighboring_side_lids (num_elems, num_patches_in_elem); + + // allocate memory for the patches in the elem + patches_in_elem = CArrayKokkos(num_elems, num_patches_in_elem, "mesh.patches_in_elem"); + + // a temporary storage for the patch_gids that are on the mesh boundary + CArrayKokkos temp_bdy_patches(num_elems * num_patches_in_elem, "temp_bdy_patches"); + + // step 1) calculate the hash values for each patch in the element + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (size_t patch_lid = 0; patch_lid < num_patches_in_elem; patch_lid++) { + size_t sorted_patch_nodes[4]; // note: cannot be allocated with num_nodes_in_patch + + // first save the patch nodes + for (size_t patch_node_lid = 0; 
patch_node_lid < num_nodes_in_patch; patch_node_lid++) { + // get the local node index of the element for this patch and node in patch + size_t node_lid = node_ordering_in_elem(patch_lid, patch_node_lid); + + // get and save the global index of the node + sorted_patch_nodes[patch_node_lid] = nodes_in_elem(elem_gid, node_lid); + } // end for node_lid + + // sort nodes from smallest to largest + bubble_sort(sorted_patch_nodes, num_nodes_in_patch); + + // save hash_keys in the this elem + for (size_t key_lid = 0; key_lid < num_nodes_in_patch; key_lid++) { + hash_keys_in_elem(elem_gid, patch_lid, key_lid) = sorted_patch_nodes[key_lid]; // 4 node values are keys + } // for + } // end for patch_lid + }); // end FOR_ALL elem_gid + + DCArrayKokkos num_values(2, "num_values"); + + // 8x8x8 mesh + // num_patches = 8*8*9*3 = 1728 + // bdy_patches = 8*8*6 = 384 + // + + // step 2: walk around the elements and save the elem pairs that have the same hash_key + RUN_CLASS({ + // serial execution on GPU + + size_t patch_gid = 0; + size_t bdy_patch_gid = 0; + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + // loop over the patches in this elem + for (size_t patch_lid = 0; patch_lid < num_patches_in_elem; patch_lid++) { + size_t exit = 0; + + // negative values mean the patch has not been saved + if (hash_keys_in_elem(elem_gid, patch_lid, 0) >= 0) { + // find the nighboring patch with the same hash_key + + for (size_t neighbor_elem_lid = 0; neighbor_elem_lid < num_elems_in_elem(elem_gid); neighbor_elem_lid++) { + // get the neighboring element global index + size_t neighbor_elem_gid = elems_in_elem(elem_gid, neighbor_elem_lid); + + for (size_t neighbor_patch_lid = 0; neighbor_patch_lid < num_patches_in_elem; neighbor_patch_lid++) { + size_t save_it = 0; + for (size_t key_lid = 0; key_lid < num_nodes_in_patch; key_lid++) { + if (hash_keys_in_elem(neighbor_elem_gid, neighbor_patch_lid, key_lid) == hash_keys_in_elem(elem_gid, patch_lid, key_lid)) { + save_it++; // if 
save_it == num_nodes after this loop, then it is a match + } + } // end key loop + + // this hash is from the nodes on the patch + if (save_it == num_nodes_in_patch) { + // make it negative, because we saved it + hash_keys_in_elem(elem_gid, patch_lid, 0) = -1; + hash_keys_in_elem(neighbor_elem_gid, neighbor_patch_lid, 0) = -1; + + // save the patch_lids for the adjacent sides + // neighboring_side_lids(elem_gid, patch_lid) = neighbor_patch_lid; + // neighboring_side_lids(neighbor_elem_gid, neighbor_patch_lid) = patch_lid; + + // save the patch_gid + patches_in_elem(elem_gid, patch_lid) = patch_gid; + patches_in_elem(neighbor_elem_gid, neighbor_patch_lid) = patch_gid; + + patch_gid++; + + exit = 1; + break; + } // end if + } // end for loop over a neighbors patch set + + if (exit == 1) { + break; + } + } // end for loop over elem neighbors + } // end if hash<0 + } // end for patch_lid + + // loop over the patches in this element again + // remaining positive hash key values are the boundary patches + for (size_t patch_lid = 0; patch_lid < num_patches_in_elem; patch_lid++) { + if (hash_keys_in_elem(elem_gid, patch_lid, 0) >= 0) { + hash_keys_in_elem(elem_gid, patch_lid, 0) = -1; // make it negative, because we saved it + + // neighboring_side_lids(elem_gid, patch_lid) = patch_lid; + + patches_in_elem(elem_gid, patch_lid) = patch_gid; + temp_bdy_patches(bdy_patch_gid) = patch_gid; + + patch_gid++; + bdy_patch_gid++; + } // end if + } // end for over patch_lid + } // end for over elem_gid + + // the num_values is because the values passed in are const, so a const pointer is needed + num_values(0) = patch_gid; // num_patches = patch_gid; + num_values(1) = bdy_patch_gid; // num_bdy_patches = bdy_patch_gid; + }); // end RUN + Kokkos::fence(); + + num_values.update_host(); + Kokkos::fence(); + + num_patches = num_values.host(0); + // this lines assumes num_surfs == num_patches, only valid for 1st order elements + num_surfs = num_values.host(0); + num_bdy_patches = 
num_values.host(1); + + // size_t mesh_1D = 60; + // size_t exact_num_patches = (mesh_1D*mesh_1D)*(mesh_1D+1)*3; + // size_t exact_num_bdy_patches = (mesh_1D*mesh_1D)*6; + // printf("num_patches = %lu, exact = %lu \n", num_patches, exact_num_patches); + // printf("num_bdy_patches = %lu exact = %lu \n", num_bdy_patches, exact_num_bdy_patches); + + // printf("Num patches = %lu \n", num_patches); + // printf("Num boundary patches = %lu \n", num_bdy_patches); + + elems_in_patch = CArrayKokkos(num_patches, 2, "mesh.elems_in_patch"); + nodes_in_patch = CArrayKokkos(num_patches, num_nodes_in_patch, "mesh.nodes_in_patch"); + + // a temporary variable to help populate patch structures + CArrayKokkos num_elems_in_patch_saved(num_patches, "num_elems_in_patch_saved"); + + // initialize the number of elems in a patch saved to zero + FOR_ALL_CLASS(patch_gid, 0, num_patches, { + num_elems_in_patch_saved(patch_gid) = 0; + }); + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(patch_lid, 0, num_patches_in_elem, { + size_t patch_gid = patches_in_elem(elem_gid, patch_lid); + + size_t num_saved = num_elems_in_patch_saved(patch_gid); + + elems_in_patch(patch_gid, num_saved) = elem_gid; + + // record that an elem_gid was saved + num_elems_in_patch_saved(patch_gid)++; + + // save the nodes on this patch + for (size_t patch_node_lid = 0; patch_node_lid < num_nodes_in_patch; patch_node_lid++) { + // get the local node index of the element for this patch and node in patch + size_t node_lid = node_ordering_in_elem(patch_lid, patch_node_lid); + + // get and save the global index of the node + nodes_in_patch(patch_gid, patch_node_lid) = nodes_in_elem(elem_gid, node_lid); + } // end for node_lid + }); // end FOR_ALL patch_lid + } // end for + + // Surfaces and patches in surface + if (elem_kind == mesh_init::arbitrary_tensor_element) { + // allocate memory for the surfaces in the elem + surfs_in_elem = CArrayKokkos(num_elems, num_surfs_in_elem); + + // allocate 
memory for surface data structures + num_surfs = num_patches / num_patches_in_surf; + + patches_in_surf = CArrayKokkos(num_surfs, num_patches_in_surf, "mesh.patches_in_surf"); + elems_in_surf = CArrayKokkos(num_surfs, 2, "mesh.elems_in_surf"); + surf_in_patch = CArrayKokkos(num_patches, "mesh.surf_in_patch"); + + FOR_ALL_CLASS(surf_gid, 0, num_surfs, { + // loop over the patches in this surface + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + // get patch_gid + size_t patch_gid = patch_lid + surf_gid * num_patches_in_surf; + + // save the patch_gids + patches_in_surf(surf_gid, patch_lid) = patch_gid; + + // save the surface this patch belongs to + surf_in_patch(patch_gid) = surf_gid; + } // end for + + // get first patch in the surface, and populate elem surface structures + size_t this_patch_gid = surf_gid * num_patches_in_surf; + + elems_in_surf(surf_gid, 0) = elems_in_patch(this_patch_gid, 0); // elem_gid0 + elems_in_surf(surf_gid, 1) = elems_in_patch(this_patch_gid, 1); // elem_gid1 + }); // end FOR_ALL over surfaces + + // save surfaces in elem + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + // get the local patch_lid + size_t patch_lid = surf_lid * num_patches_in_surf; + + // get the patch_gids in this element + size_t patch_gid = patches_in_elem(elem_gid, patch_lid); + + // save the surface gid + // Grab the first patch on surf and return surface_gid from surf_in_patch // + surfs_in_elem(elem_gid, surf_lid) = surf_in_patch(patch_gid); + } // end surf_lid + }); + + DViewCArrayKokkos surf_node_ordering_in_elem; + + if (num_dims == 3) { + // num_1D = Pn+1 + int num_surface_nodes = num_surfs_in_elem * pow(num_1D, num_dims - 1); + size_t temp_surf_node_lids[num_surface_nodes]; + // 2D arbitrary order elements + int count = 0; + + for (int i_surf = 0; i_surf < 2; i_surf++) { + for (int k = 0; k < num_1D; k++) { + for (int j = 0; j < num_1D; j++) { + // node_lid 0 in 
patch + // index = i + j*num_1D + k*num_1D*num_1D; + temp_surf_node_lids[count] = i_surf + j * num_1D + k * num_1D * num_1D; + count++; + } // end for k + } // end for j + } + + for (int j_surf = 0; j_surf < 2; j_surf++) { + for (int k = 0; k < num_1D; k++) { + for (int i = 0; i < num_1D; i++) { + // node_lid 0 in patch + temp_surf_node_lids[count] = i + j_surf * num_1D + k * num_1D * num_1D; + count++; + } + } + } + + for (int k_surf = 0; k_surf < 2; k_surf++) { + for (int j = 0; j < num_1D; j++) { + for (int i = 0; i < num_1D; i++) { + // node_lid 0 in patch + temp_surf_node_lids[count] = i + j * num_1D + k_surf * num_1D * num_1D; + count++; + } + } + } + + nodes_in_surf = CArrayKokkos(num_surfs, num_1D * num_1D, "mesh.nodes_in_surf"); + + num_nodes_in_surf = num_1D * num_1D; + surf_node_ordering_in_elem = DViewCArrayKokkos(&temp_surf_node_lids[0], num_surfs_in_elem, num_nodes_in_surf); + surf_node_ordering_in_elem.update_device(); + for (int elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(surf_lid, 0, num_surfs_in_elem, { + int surf_gid = surfs_in_elem(elem_gid, surf_lid); + for (int surf_node_lid = 0; surf_node_lid < num_nodes_in_surf; surf_node_lid++) { + int node_lid = surf_node_ordering_in_elem(surf_lid, surf_node_lid); + int node_gid = nodes_in_elem(elem_gid, node_lid); + nodes_in_surf(surf_gid, surf_node_lid) = node_gid; + } // end loop over surf_node_lid + }); // end loop over FOR_ALL_CLASS + } // end loop over elem_gid + } // end 3D scope + } // end of high-order mesh objects + + // ---------------- + + // allocate memory for boundary patches + bdy_patches = CArrayKokkos(num_bdy_patches, "mesh.bdy_patches"); + + FOR_ALL_CLASS(bdy_patch_gid, 0, num_bdy_patches, { + bdy_patches(bdy_patch_gid) = temp_bdy_patches(bdy_patch_gid); + }); // end FOR_ALL bdy_patch_gid + + // find and store the boundary nodes + CArrayKokkos temp_bdy_nodes(num_nodes, "temp_bdy_nodes"); + CArrayKokkos hash_bdy_nodes(num_nodes, "hash_bdy_nodes"); + + 
FOR_ALL_CLASS(node_gid, 0, num_nodes, { + hash_bdy_nodes(node_gid) = -1; + }); // end for node_gid + + // Parallel loop over boundary patches + DCArrayKokkos num_bdy_nodes_saved(1, "num_bdy_nodes_saved"); + + RUN_CLASS({ + num_bdy_nodes_saved(0) = 0; + for (size_t bdy_patch_gid = 0; bdy_patch_gid < num_bdy_patches; bdy_patch_gid++) { + // get the global index of the patch that is on the boundary + size_t patch_gid = bdy_patches(bdy_patch_gid); + + // tag the boundary nodes + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + size_t node_gid = nodes_in_patch(patch_gid, node_lid); + + if (hash_bdy_nodes(node_gid) < 0) { + hash_bdy_nodes(node_gid) = node_gid; + temp_bdy_nodes(num_bdy_nodes_saved(0)) = node_gid; + + // printf("bdy_node = %lu \n", node_gid); + num_bdy_nodes_saved(0)++; + } // end if + } // end for node_lid + } // end for loop over bdy_patch_gid + }); // end RUN + Kokkos::fence(); + + // copy value to host (CPU) + num_bdy_nodes_saved.update_host(); + Kokkos::fence(); + + // save the number of bdy_nodes to Mesh_t + num_bdy_nodes = num_bdy_nodes_saved.host(0); + + bdy_nodes = CArrayKokkos(num_bdy_nodes, "mesh.bdy_nodes"); + + FOR_ALL_CLASS(node_gid, 0, num_bdy_nodes, { + bdy_nodes(node_gid) = temp_bdy_nodes(node_gid); + }); // end for boundary node_gid + + // printf("Num boundary nodes = %lu \n", num_bdy_nodes); + + return; + } // end patch connectivity method + + // build the patches + void build_node_node_connectivity() + { + // find the max number of elems around a node + size_t max_num_elems_in_node; + size_t max_num_lcl; + FOR_REDUCE_MAX_CLASS(node_gid, 0, num_nodes, max_num_lcl, { + // num_corners_in_node = num_elems_in_node + size_t max_num = num_corners_in_node(node_gid); + + if (max_num > max_num_lcl) { + max_num_lcl = max_num; + } + }, max_num_elems_in_node); // end parallel reduction on max + Kokkos::fence(); + + // each elem corner will contribute 3 edges to the node. 
Those edges will likely be the same + // ones from an adjacent element so it is a safe estimate to multiply by 3 + DynamicRaggedRightArrayKokkos temp_nodes_in_nodes(num_nodes, max_num_elems_in_node * 3, "temp_nodes_in_nodes"); + + num_nodes_in_node = CArrayKokkos(num_nodes, "mesh.num_nodes_in_node"); + + // walk over the patches and save the node node connectivity + RUN_CLASS({ + if (num_dims == 3) { + for (size_t patch_gid = 0; patch_gid < num_patches; patch_gid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + // the first node on the edge + size_t node_gid_0 = nodes_in_patch(patch_gid, node_lid); + + // second node on this edge + size_t node_gid_1; + + if (node_lid == num_nodes_in_patch - 1) { + node_gid_1 = nodes_in_patch(patch_gid, 0); + } + else { + node_gid_1 = nodes_in_patch(patch_gid, node_lid + 1); + } // end if + + size_t num_saved_0 = temp_nodes_in_nodes.stride(node_gid_0); + size_t num_saved_1 = temp_nodes_in_nodes.stride(node_gid_1); + + size_t save_0 = 1; + size_t save_1 = 1; + + // check to see if the node_gid_1 was already saved + for (size_t contents_lid = 0; contents_lid < num_saved_0; contents_lid++) { + if (temp_nodes_in_nodes(node_gid_0, contents_lid) == node_gid_1) { + save_0 = 0; // don't save, it was already saved + } + } + + // check to see if the node_gid_0 was already saved + for (size_t contents_lid = 0; contents_lid < num_saved_1; contents_lid++) { + if (temp_nodes_in_nodes(node_gid_1, contents_lid) == node_gid_0) { + save_1 = 0; // don't save, it was already saved + } + } + + if (save_0 == 1) { + // increment the number of nodes in a node saved + temp_nodes_in_nodes.stride(node_gid_0)++; + + // save the second node to the first node + temp_nodes_in_nodes(node_gid_0, num_saved_0) = node_gid_1; + } + + if (save_1 == 1) { + // increment the number of nodes in a node saved + temp_nodes_in_nodes.stride(node_gid_1)++; + + // save the first node to the second node + temp_nodes_in_nodes(node_gid_1, num_saved_1) = 
node_gid_0; + } + + // save the strides + num_nodes_in_node(node_gid_0) = temp_nodes_in_nodes.stride(node_gid_0); + num_nodes_in_node(node_gid_1) = temp_nodes_in_nodes.stride(node_gid_1); + } // end for node in patch + } // end for patches + } // end if 3D + else { + for (size_t patch_gid = 0; patch_gid < num_patches; patch_gid++) { + // the first node on the edge + size_t node_gid_0 = nodes_in_patch(patch_gid, 0); + + // second node on this edge + size_t node_gid_1 = nodes_in_patch(patch_gid, 1); + + size_t num_saved_0 = temp_nodes_in_nodes.stride(node_gid_0); + size_t num_saved_1 = temp_nodes_in_nodes.stride(node_gid_1); + + // increment the number of nodes in a node saved + temp_nodes_in_nodes.stride(node_gid_0)++; + temp_nodes_in_nodes.stride(node_gid_1)++; + + // save the second node to the first node + temp_nodes_in_nodes(node_gid_0, num_saved_0) = node_gid_1; + + // save the first node to the second node + temp_nodes_in_nodes(node_gid_1, num_saved_1) = node_gid_0; + + // save the strides + num_nodes_in_node(node_gid_0) = temp_nodes_in_nodes.stride(node_gid_0); + num_nodes_in_node(node_gid_1) = temp_nodes_in_nodes.stride(node_gid_1); + } // end for patches + } // end if 2D + }); // end RUN + Kokkos::fence(); + + nodes_in_node = RaggedRightArrayKokkos(num_nodes_in_node, "mesh.nodes_in_node"); + + // save the connectivity + FOR_ALL_CLASS(node_gid, 0, num_nodes, { + size_t num_saved = 0; + for (size_t node_lid = 0; node_lid < num_nodes_in_node(node_gid); node_lid++) { + nodes_in_node(node_gid, num_saved) = temp_nodes_in_nodes(node_gid, num_saved); + + // increment the number of nodes in node saved + num_saved++; + } // end for node_lid + }); // end parallel for over nodes + } // end of node node connectivity + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn build_connectivity + /// + /// \brief Calls multiple build connectivity function + /// + 
///////////////////////////////////////////////////////////////////////////// + void build_connectivity() + { + build_corner_connectivity(); + if (verbose) printf("Built corner connectivity \n"); + + build_elem_elem_connectivity(); + if (verbose) printf("Built element-element connectivity \n"); + + build_patch_connectivity(); + if (verbose) printf("Built patch connectivity \n"); + + build_node_node_connectivity(); + if (verbose) printf("Built node-node connectivity \n"); + } + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn init_bdy_sets + /// + /// \brief Initialize memory for boundary sets + /// + ///////////////////////////////////////////////////////////////////////////// + void init_bdy_sets(size_t num_bcs) + { + // if (num_bcs == 0) { + // printf("ERROR: number of boundary sets = 0, set it = 1"); + // num_bcs = 1; + // } + num_bdy_sets = num_bcs; + num_bdy_patches_in_set = DCArrayKokkos(num_bcs, "mesh.num_bdy_patches_in_set"); + + // bdy_patches_in_set is a raggedRight array, it is allocated + // in tag_bdys fcn after the sparsity is known, see geometry_new.cpp + + return; + } // end of init_bdy_sets method + + +}; // end Mesh_t + +#endif \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp new file mode 100644 index 00000000..c9e143f5 --- /dev/null +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -0,0 +1,101 @@ +// #include +// #include +// #include +// #include +// #include +// #include +// #include +// #include + + +// #include "mesh.h" +// #include "state.h" +// #include "mesh_io.h" + +#include "decomp_utils.h" + +// Include Scotch headers +#include "scotch.h" +#include "ptscotch.h" + +int main(int argc, char** argv) { + + MPI_Init(&argc, &argv); + MATAR_INITIALIZE(argc, argv); + { // MATAR scope + + int world_size; + int rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + double t_main_start = 
MPI_Wtime(); + + // Mesh size + double origin[3] = {0.0, 0.0, 0.0}; + double length[3] = {1.0, 1.0, 1.0}; + int num_elems_dim[3] = {30, 30, 30}; + + // Initial mesh built on rank zero + Mesh_t initial_mesh; + node_t initial_node; + + // Mesh partitioned by pt-scotch, including ghost + Mesh_t final_mesh; + node_t final_node; + + GaussPoint_t gauss_point; + +// ******************************************************** +// Build the initial mesh +// ******************************************************** + + double t_init_mesh_start = MPI_Wtime(); + if (rank == 0) { + std::cout<<"World size: "< +#include "matar.h" + +namespace mesh_input +{ +// source of the mesh +enum source +{ + none = 0, ///< No source given, should fail + generate = 1, ///< Create the mesh using the mesh builder + file = 2, ///< Read in the mesh from a file +}; + +// type of mesh to generate if source = generate +enum type +{ + Box = 0, // Create the mesh using the mesh builder + Polar = 1, // Create a polar 2D mesh +}; +} // end of namespace + +static std::map mesh_input_source_map +{ + { "generate", mesh_input::generate }, + { "file", mesh_input::file } +}; + +static std::map mesh_input_type_map +{ + { "box", mesh_input::Box }, + { "polar", mesh_input::Polar } +}; + +///////////////////////////////////////////////////////////////////////////// +/// +/// \struct mesh_input_t +/// +/// \brief Meshing related input parameters +/// +///////////////////////////////////////////////////////////////////////////// +struct mesh_input_t +{ + int num_dims = 3; ///< Number of dimensions for the mesh + mesh_input::source source = mesh_input::none; ///< Source of mesh, file or generate + std::string file_path = ""; ///< Absolute path of mesh file + mesh_input::type type; ///< Type of mesh to generate if + + double origin[3] = { 0.0, 0.0, 0.0 }; ///< Mesh origin for generating a mesh + double length[3] = { 0.0, 0.0, 0.0 }; ///< x,y,z length of generated mesh + size_t num_elems[3] = { 1, 1, 1 }; ///< Number of 
elements along x,y, z for generating a mesh. + + size_t p_order = 1; + + // WARNING, NOT YET PARSED + double inner_radius = 0.0; ///< Inner radius for generating 2D RZ mesh + double outer_radius = 1.0; ///< Outer radius for generating 2D RZ mesh + double starting_angle = 0.0; ///< Starting angle in degrees for 2D RZ mesh + double ending_angle = 90; ///< Ending angle in degrees for 2D RZ mesh + + int num_radial_elems = 10; ///< Number of elements in the radial direction for 2DRZ mesh + int num_angular_elems = 10; ///< Number of elements in the radial direction for 2DRZ mesh + + double scale_x = 1.0; ///< Scales mesh x coordinate dimensions + double scale_y = 1.0; ///< Scales mesh y coordinate dimensions + double scale_z = 1.0; ///< Scales mesh z coordinate dimensions + + DCArrayKokkos object_ids; ///< the object_ids in the vtu full mesh file (from exodus mesh) + +}; // mesh_input_t + +// ---------------------------------- +// valid inputs for mesh options +// ---------------------------------- +static std::vector str_mesh_inps +{ + "num_dims", + "source", + "file_path", + "type", + "origin", + "length", + "num_elems", + "polynomial_order", + "inner_radius", + "outer_radius", + "starting_angle", + "ending_angle", + "num_radial_elems", + "num_angular_elems", + "scale_x", + "scale_y", + "scale_z" +}; + +// ---------------------------------- +// required inputs for mesh options +// ---------------------------------- +static std::vector mesh_required_inps +{ + "source", + "num_dims", +}; + +#endif // end Header Guard \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h new file mode 100644 index 00000000..aec7a963 --- /dev/null +++ b/examples/mesh_decomp/mesh_io.h @@ -0,0 +1,1061 @@ +#ifndef MESH_IO_H +#define MESH_IO_H + +#include "matar.h" +#include "mesh.h" +#include "state.h" + +using namespace mtr; + +#include +#include +#include +#include +#include +#include // for string pattern recoginition +#include +#include 
+#include +#include +#include +#include + + + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn split +/// +/// \brief Splits a string by a given delimiter +/// +/// \param Input string +/// \param delimiter +/// +/// \return Vector of split string values +/// +///////////////////////////////////////////////////////////////////////////// +inline std::vector split(std::string s, std::string delimiter) +{ + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + std::string token; + std::vector res; + + while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + + res.push_back(s.substr(pos_start)); + return res; +} // end of split + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn get_id +/// +/// \brief This gives the index value of the point or the elem +/// +/// Assumes that the grid has an i,j,k structure +/// the elem = i + (j)*(num_points_i-1) + (k)*(num_points_i-1)*(num_points_j-1) +/// the point = i + (j)*num_points_i + (k)*num_points_i*num_points_j +/// +/// \param i index +/// \param j index +/// \param k index +/// \param Number of i indices +/// \param Number of j indices +/// +///////////////////////////////////////////////////////////////////////////// +KOKKOS_INLINE_FUNCTION +size_t get_id(int i, int j, int k, int num_i, int num_j) +{ + return i + j * num_i + k * num_i * num_j; +} // end get_id + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn PointIndexFromIJK +/// +/// \brief Given (i,j,k) coordinates within the Lagrange hex, return an +/// offset into the local connectivity (PointIds) array. The order parameter +/// must point to an array of 3 integers specifying the order along each +/// axis of the hexahedron. 
+/// +///////////////////////////////////////////////////////////////////////////// +inline int PointIndexFromIJK(int i, int j, int k, const int* order) +{ + bool ibdy = (i == 0 || i == order[0]); + bool jbdy = (j == 0 || j == order[1]); + bool kbdy = (k == 0 || k == order[2]); + // How many boundaries do we lie on at once? + int nbdy = (ibdy ? 1 : 0) + (jbdy ? 1 : 0) + (kbdy ? 1 : 0); + + if (nbdy == 3) { // Vertex DOF + // ijk is a corner node. Return the proper index (somewhere in [0,7]): + return (i ? (j ? 2 : 1) : (j ? 3 : 0)) + (k ? 4 : 0); + } + + int offset = 8; + if (nbdy == 2) { // Edge DOF + if (!ibdy) { // On i axis + return (i - 1) + (j ? order[0] - 1 + order[1] - 1 : 0) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; + } + if (!jbdy) { // On j axis + return (j - 1) + (i ? order[0] - 1 : 2 * (order[0] - 1) + order[1] - 1) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; + } + // !kbdy, On k axis + offset += 4 * (order[0] - 1) + 4 * (order[1] - 1); + return (k - 1) + (order[2] - 1) * (i ? (j ? 3 : 1) : (j ? 2 : 0)) + offset; + } + + offset += 4 * (order[0] - 1 + order[1] - 1 + order[2] - 1); + if (nbdy == 1) { // Face DOF + if (ibdy) { // On i-normal face + return (j - 1) + ((order[1] - 1) * (k - 1)) + (i ? (order[1] - 1) * (order[2] - 1) : 0) + offset; + } + offset += 2 * (order[1] - 1) * (order[2] - 1); + if (jbdy) { // On j-normal face + return (i - 1) + ((order[0] - 1) * (k - 1)) + (j ? (order[2] - 1) * (order[0] - 1) : 0) + offset; + } + offset += 2 * (order[2] - 1) * (order[0] - 1); + // kbdy, On k-normal face + return (i - 1) + ((order[0] - 1) * (j - 1)) + (k ? 
(order[0] - 1) * (order[1] - 1) : 0) + offset; + } + + // nbdy == 0: Body DOF + offset += 2 * ( (order[1] - 1) * (order[2] - 1) + (order[2] - 1) * (order[0] - 1) + (order[0] - 1) * (order[1] - 1)); + return offset + (i - 1) + (order[0] - 1) * ( (j - 1) + (order[1] - 1) * ( (k - 1))); +} + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn build_3d_box +/// +/// \brief Builds an unstructured 3D rectilinear mesh +/// +/// \param Simulation mesh that is built +/// \param Element state data +/// \param Node state data +/// \param origin The origin of the mesh +/// \param length The length of the mesh +/// \param num_elems The number of elements in the mesh +/// +///////////////////////////////////////////////////////////////////////////// +void build_3d_box( + Mesh_t& mesh, + node_t& node, + double origin[3], + double length[3], + int num_elems_dim[3]) +{ + printf("Creating a 3D box mesh \n"); + + const int num_dim = 3; + + // Note: In fierro, these come from the simulation parameters + const double lx = length[0]; + const double ly = length[1]; + const double lz = length[2]; + + // Note: In fierro, these come from the simulation parameters + const int num_elems_i = num_elems_dim[0]; + const int num_elems_j = num_elems_dim[1]; + const int num_elems_k = num_elems_dim[2]; + + const int num_points_i = num_elems_i + 1; // num points in x + const int num_points_j = num_elems_j + 1; // num points in y + const int num_points_k = num_elems_k + 1; // num points in y + + const int num_nodes = num_points_i * num_points_j * num_points_k; + + const double dx = lx / ((double)num_elems_i); // len/(num_elems_i) + const double dy = ly / ((double)num_elems_j); // len/(num_elems_j) + const double dz = lz / ((double)num_elems_k); // len/(num_elems_k) + + const int num_elems = num_elems_i * num_elems_j * num_elems_k; + + // --- 3D parameters --- + // const int num_faces_in_elem = 6; // number of faces in elem + // const int num_points_in_elem = 8; 
// number of points in elem + // const int num_points_in_face = 4; // number of points in a face + // const int num_edges_in_elem = 12; // number of edges in a elem + + // initialize mesh node variables + mesh.initialize_nodes(num_nodes); + + // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dim, required_node_state); + + // --- Build nodes --- + + CArrayDual origin_mtr(3, "origin_mtr"); + origin_mtr.host(0) = origin[0]; + origin_mtr.host(1) = origin[1]; + origin_mtr.host(2) = origin[2]; + origin_mtr.update_device(); + + // populate the point data structures + FOR_ALL(k, 0, num_points_k, + j, 0, num_points_j, + i, 0, num_points_i,{ + + // global id for the point + size_t node_gid = get_id(i, j, k, num_points_i, num_points_j); + + // store the point coordinates + node.coords(node_gid, 0) = origin_mtr(0) + (double)i * dx; + node.coords(node_gid, 1) = origin_mtr(1) + (double)j * dy; + node.coords(node_gid, 2) = origin_mtr(2) + (double)k * dz; + }); + // Update the host side + node.coords.update_host(); + + // initialize elem variables + mesh.initialize_elems(num_elems, num_dim); + + // populate the point data structures + FOR_ALL(k, 0, num_elems_k, + j, 0, num_elems_j, + i, 0, num_elems_i,{ + + // global id for the elem + size_t elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); + + // store the point IDs for this elem where the range is + // (i:i+1, j:j+1, k:k+1) for a linear hexahedron + int this_point = 0; + for (int kcount = k; kcount <= k + 1; kcount++) { + for (int jcount = j; jcount <= j + 1; jcount++) { + for (int icount = i; icount <= i + 1; icount++) { + // global id for the points + size_t node_gid = get_id(icount, jcount, kcount, + num_points_i, num_points_j); + + // convert this_point index to the FE index convention + int this_index = this_point; 
//convert_point_number_in_Hex(this_point); + + // store the points in this elem according the the finite + // element numbering convention + mesh.nodes_in_elem(elem_gid, this_index) = node_gid; + + // increment the point counting index + this_point++; + } // end for icount + } // end for jcount + } // end for kcount + }); // end parallel for + + // Update the host side + mesh.nodes_in_elem.update_host(); + + Kokkos::fence(); + + // Build connectivity + mesh.build_connectivity(); +} // end build_3d_box + + + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn write_vtk +/// +/// \brief Writes a vtk output file +/// +/// \param mesh mesh +/// \param node node data +/// \param rank rank +/// +///////////////////////////////////////////////////////////////////////////// + void write_vtk(Mesh_t& mesh, + node_t& node, + int rank) + { + + CArray graphics_times(1); + int graphics_id = 0; + graphics_times(0) = 0.0; + + // ---- Update host data ---- + + node.coords.update_host(); + + Kokkos::fence(); + + + const int num_cell_scalar_vars = 3; + const int num_cell_vec_vars = 0; + const int num_cell_tensor_vars = 0; + + const int num_point_scalar_vars = 3; + const int num_point_vec_vars = 2; + + + // Scalar values associated with a cell + const char cell_scalar_var_names[num_cell_scalar_vars][30] = { + "rank_id", "elems_in_elem_owned", "global_elem_id" + }; + + // const char cell_vec_var_names[num_cell_vec_vars][15] = { + + // }; + + const char point_scalar_var_names[num_point_scalar_vars][15] = { + "rank_id", "elems_in_node", "scalar_field" + }; + + const char point_vec_var_names[num_point_vec_vars][15] = { + "pos", "vector_field" + }; + + // short hand + const size_t num_nodes = mesh.num_owned_nodes; + const size_t num_elems = mesh.num_owned_elems; + const size_t num_dims = mesh.num_dims; + + + // save the cell state to an array for exporting to graphics files + auto elem_fields = CArray(num_elems, num_cell_scalar_vars); + int 
elem_switch = 1; + + + // save the output scale fields to a single 2D array + + + // export material centeric data to the elements + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + elem_fields(elem_gid, 0) = rank; + elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); + elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); + } + + + // save the vertex vector fields to an array for exporting to graphics files + CArray vec_fields(num_nodes, num_point_vec_vars, 3); + CArray point_scalar_fields(num_nodes, num_point_scalar_vars); + + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + // position, var 0 + vec_fields(node_gid, 0, 0) = node.coords.host(node_gid, 0); + vec_fields(node_gid, 0, 1) = node.coords.host(node_gid, 1); + vec_fields(node_gid, 0, 2) = node.coords.host(node_gid, 2); + + // vector field, var 1 + vec_fields(node_gid, 1, 0) = node.vector_field.host(node_gid, 0); + vec_fields(node_gid, 1, 1) = node.vector_field.host(node_gid, 1); + vec_fields(node_gid, 1, 2) = node.vector_field.host(node_gid, 2); + + point_scalar_fields(node_gid, 0) = rank; + point_scalar_fields(node_gid, 1) = (double)mesh.num_corners_in_node(node_gid); + point_scalar_fields(node_gid, 2) = node.scalar_field.host(node_gid); + + } // end for loop over vertices + + + FILE* out[20]; // the output files that are written to + char filename[100]; // char string + int max_len = sizeof filename; + int str_output_len; + + struct stat st; + + if (stat("vtk", &st) != 0) { + system("mkdir vtk"); + } + + // snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); + + //sprintf(filename, "vtk/Fierro.%05d.vtk", graphics_id); // mesh file + str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d_rank%d.vtk", graphics_id, rank); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], 
"# vtk DataFile Version 2.0\n"); // part 2 + fprintf(out[0], "Mesh for Fierro\n"); // part 2 + fprintf(out[0], "ASCII \n"); // part 3 + fprintf(out[0], "DATASET UNSTRUCTURED_GRID\n\n"); // part 4 + + fprintf(out[0], "POINTS %zu float\n", num_nodes); + + // write all components of the point coordinates + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], + "%f %f %f\n", + node.coords.host(node_gid, 0), + node.coords.host(node_gid, 1), + node.coords.host(node_gid, 2)); + } // end for + + /* + --------------------------------------------------------------------------- + Write the elems + --------------------------------------------------------------------------- + */ + + fprintf(out[0], "\n"); + fprintf(out[0], "CELLS %lu %lu\n", num_elems, num_elems + num_elems * mesh.num_nodes_in_elem); // size=all printed values + + int Pn_order = mesh.Pn; + int order[3] = { Pn_order, Pn_order, Pn_order }; + + // const int num_1D_points = Pn_order+1; + + // write all global point numbers for this elem + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(out[0], "%lu ", mesh.num_nodes_in_elem); // num points in this elem + + for (int k = 0; k <= Pn_order; k++) { + for (int j = 0; j <= Pn_order; j++) { + for (int i = 0; i <= Pn_order; i++) { + size_t node_lid = PointIndexFromIJK(i, j, k, order); + fprintf(out[0], "%lu ", mesh.nodes_in_elem.host(elem_gid, node_lid)); + } + } + } + + fprintf(out[0], "\n"); + } // end for + + // Write the element types + fprintf(out[0], "\n"); + fprintf(out[0], "CELL_TYPES %zu \n", num_elems); + // VTK_LAGRANGE_HEXAHEDRON: 72, + // VTK_HIGHER_ORDER_HEXAHEDRON: 67 + // VTK_BIQUADRATIC_QUADRATIC_HEXAHEDRON = 33 + // element types: https://vtk.org/doc/nightly/html/vtkCellType_8h_source.html + // element types: https://kitware.github.io/vtk-js/api/Common_DataModel_CellTypes.html + // vtk format: https://www.kitware.com//modeling-arbitrary-order-lagrange-finite-elements-in-the-visualization-toolkit/ + for 
(size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(out[0], "%d \n", 72); + } + + /* + --------------------------------------------------------------------------- + Write the nodal vector variables to file + --------------------------------------------------------------------------- + */ + + fprintf(out[0], "\n"); + fprintf(out[0], "POINT_DATA %zu \n", num_nodes); + + // vtk vector vars = (position, velocity) + for (int var = 0; var < num_point_vec_vars; var++) { + fprintf(out[0], "VECTORS %s float \n", point_vec_var_names[var]); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], "%f %f %f\n", + vec_fields(node_gid, var, 0), + vec_fields(node_gid, var, 1), + vec_fields(node_gid, var, 2)); + } // end for nodes + } // end for vec_vars + + + // vtk scalar vars = (rank_id, elems_in_node) + for (int var = 0; var < num_point_scalar_vars; var++) { + fprintf(out[0], "SCALARS %s float 1\n", point_scalar_var_names[var]); + fprintf(out[0], "LOOKUP_TABLE default\n"); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], "%f\n", + point_scalar_fields(node_gid, var)); + } // end for nodes + } // end for scalar_vars + + /* + --------------------------------------------------------------------------- + Write the scalar elem variable to file + --------------------------------------------------------------------------- + */ + fprintf(out[0], "\n"); + fprintf(out[0], "CELL_DATA %zu \n", num_elems); + + for (int var = 0; var < num_cell_scalar_vars; var++) { + fprintf(out[0], "SCALARS %s float 1\n", cell_scalar_var_names[var]); // the 1 is number of scalar components [1:4] + fprintf(out[0], "LOOKUP_TABLE default\n"); + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(out[0], "%f\n", elem_fields(elem_gid, var)); + } // end for elem + } // end for cell scalar_vars + + fclose(out[0]); + + // graphics_times(graphics_id) = time_value; + + // Write time series metadata + //sprintf(filename, 
"vtk/Fierro.vtk.series", graphics_id); // mesh file + str_output_len = snprintf(filename, max_len, "vtk/Fierro.vtk.series"); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "{\n"); + fprintf(out[0], " \"file-series-version\" : \"1.0\",\n"); + fprintf(out[0], " \"files\" : [\n"); + + for (int i = 0; i <= graphics_id; i++) { + fprintf(out[0], " { \"name\" : \"Fierro.%05d.vtk\", \"time\" : %12.5e },\n", i, graphics_times(i) ); + } + + // fprintf(out[0], "%12.5e\n", graphics_times(i)); + fprintf(out[0], " ]\n"); // part 4 + fprintf(out[0], "}"); // part 4 + + fclose(out[0]); + + // increment graphics id counter + // graphics_id++; + + + } // end write vtk old + + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn write_vtu +/// +/// \brief Writes a VTU (XML VTK) output file per MPI rank and a PVTU file +/// for parallel visualization in ParaView +/// +/// \param mesh mesh +/// \param node node data +/// \param rank MPI rank +/// \param comm MPI communicator +/// +///////////////////////////////////////////////////////////////////////////// +void write_vtu(Mesh_t& mesh, + node_t& node, + GaussPoint_t& gauss_point, + int rank, + MPI_Comm comm) +{ + int world_size; + MPI_Comm_size(comm, &world_size); + + CArray graphics_times(1); + int graphics_id = 0; + graphics_times(0) = 0.0; + + // ---- Update host data ---- + node.coords.update_host(); + Kokkos::fence(); + + const int num_cell_scalar_vars = 4; + const int num_cell_vec_vars = 1; + const int num_cell_tensor_vars = 0; + + const int num_point_scalar_vars = 4; + const int num_point_vec_vars = 2; + + // Scalar values associated with a cell + const char cell_scalar_var_names[num_cell_scalar_vars][30] = { + "rank_id", "elems_in_elem_owned", "global_elem_id", "field_value" + }; + + const char cell_vec_var_names[num_cell_vec_vars][15] = { + "field_vec" + }; + + const 
char point_scalar_var_names[num_point_scalar_vars][15] = { + "rank_id", "elems_in_node", "global_node_id", "scalar_field" + }; + + const char point_vec_var_names[num_point_vec_vars][15] = { + "pos", "vector_field" + }; + + // short hand + const size_t num_nodes = mesh.num_owned_nodes; + const size_t num_elems = mesh.num_owned_elems; + const size_t num_dims = mesh.num_dims; + + // save the cell state to an array for exporting to graphics files + auto elem_fields = CArray(num_elems, num_cell_scalar_vars); + auto elem_vec_fields = CArray(num_elems, num_cell_vec_vars, 3); + + DCArrayKokkos num_elems_in_elem(mesh.num_elems, "tmp_num_elem_in_elem"); + FOR_ALL(i, 0, mesh.num_elems, { + num_elems_in_elem(i) = (double)mesh.num_elems_in_elem(i); + }); + MATAR_FENCE(); + num_elems_in_elem.update_host(); + MATAR_FENCE(); + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + elem_fields(elem_gid, 0) = rank; + elem_fields(elem_gid, 1) = num_elems_in_elem.host(elem_gid); + elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); + elem_fields(elem_gid, 3) = gauss_point.fields.host(elem_gid); + elem_vec_fields(elem_gid, 0, 0) = gauss_point.fields_vec.host(elem_gid, 0); + elem_vec_fields(elem_gid, 0, 1) = gauss_point.fields_vec.host(elem_gid, 1); + elem_vec_fields(elem_gid, 0, 2) = gauss_point.fields_vec.host(elem_gid, 2); + } + + // save the vertex vector fields to an array for exporting to graphics files + CArray vec_fields(num_nodes, num_point_vec_vars, 3); + CArray point_scalar_fields(num_nodes, num_point_scalar_vars); + + + DCArrayKokkos num_elems_in_node(mesh.num_elems, "tmp_num_elems_in_node"); + FOR_ALL(i, 0, mesh.num_elems, { + num_elems_in_node(i) = (double)mesh.num_corners_in_node(i); + }); + MATAR_FENCE(); + num_elems_in_node.update_host(); + + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + // position, var 0 + vec_fields(node_gid, 0, 0) = node.coords.host(node_gid, 0); + vec_fields(node_gid, 0, 1) = 
node.coords.host(node_gid, 1); + vec_fields(node_gid, 0, 2) = node.coords.host(node_gid, 2); + + // vector field, var 1 + vec_fields(node_gid, 1, 0) = node.vector_field.host(node_gid, 0); + vec_fields(node_gid, 1, 1) = node.vector_field.host(node_gid, 1); + vec_fields(node_gid, 1, 2) = node.vector_field.host(node_gid, 2); + + point_scalar_fields(node_gid, 0) = rank; + point_scalar_fields(node_gid, 1) = num_elems_in_node.host(node_gid); + point_scalar_fields(node_gid, 2) = (double)mesh.local_to_global_node_mapping.host(node_gid); + point_scalar_fields(node_gid, 3) = node.scalar_field.host(node_gid); + } + + // File management + char filename[200]; + int max_len = sizeof filename; + int str_output_len; + + struct stat st; + if (stat("vtk", &st) != 0) { + system("mkdir vtk"); + } + + // Create VTU filename for this rank + str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d_rank%d.vtu", graphics_id, rank); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + FILE* vtu_file = fopen(filename, "w"); + if (!vtu_file) { + std::cerr << "[rank " << rank << "] Failed to open VTU file: " << filename << std::endl; + return; + } + + // Write VTU XML header + fprintf(vtu_file, "\n"); + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n", num_nodes, num_elems); + + // Write Points (coordinates) + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(vtu_file, " %f %f %f\n", + node.coords.host(node_gid, 0), + node.coords.host(node_gid, 1), + node.coords.host(node_gid, 2)); + } + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + + // Write Cells (connectivity) + fprintf(vtu_file, " \n"); + + // Connectivity array - all node indices for all cells, space-separated + fprintf(vtu_file, " \n"); + int Pn_order = mesh.Pn; + int order[3] = { Pn_order, Pn_order, Pn_order }; + + // Write connectivity: all node IDs for all 
elements, space-separated + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + for (int k = 0; k <= Pn_order; k++) { + for (int j = 0; j <= Pn_order; j++) { + for (int i = 0; i <= Pn_order; i++) { + size_t node_lid = PointIndexFromIJK(i, j, k, order); + fprintf(vtu_file, " %zu", static_cast(mesh.nodes_in_elem.host(elem_gid, node_lid))); + } + } + } + } + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + + // Offsets array - cumulative index where each cell's connectivity ends + fprintf(vtu_file, " \n"); + int offset = 0; + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + offset += static_cast(mesh.num_nodes_in_elem); + fprintf(vtu_file, " %d", offset); + } + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + + // Types array (72 = VTK_LAGRANGE_HEXAHEDRON) + fprintf(vtu_file, " \n"); + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(vtu_file, " 72"); + } + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + + // Write PointData (node fields) + fprintf(vtu_file, " \n"); + + // Point vector variables + for (int var = 0; var < num_point_vec_vars; var++) { + fprintf(vtu_file, " \n", + point_vec_var_names[var]); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(vtu_file, " %f %f %f\n", + vec_fields(node_gid, var, 0), + vec_fields(node_gid, var, 1), + vec_fields(node_gid, var, 2)); + } + fprintf(vtu_file, " \n"); + } + + // Point scalar variables + for (int var = 0; var < num_point_scalar_vars; var++) { + fprintf(vtu_file, " \n", + point_scalar_var_names[var]); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(vtu_file, " %f\n", point_scalar_fields(node_gid, var)); + } + fprintf(vtu_file, " \n"); + } + fprintf(vtu_file, " \n"); + + // Write CellData (element fields) + fprintf(vtu_file, " \n"); + + // Cell vector variables + for (int var = 0; var < num_cell_vec_vars; var++) { + fprintf(vtu_file, " \n", + cell_vec_var_names[var]); + for 
(size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + // TODO: Populate cell vector field data from appropriate source + fprintf(vtu_file, " %f %f %f\n", + gauss_point.fields_vec.host(elem_gid, 0), + gauss_point.fields_vec.host(elem_gid, 1), + gauss_point.fields_vec.host(elem_gid, 2)); + } + fprintf(vtu_file, " \n"); + } + + // Cell scalar variables + for (int var = 0; var < num_cell_scalar_vars; var++) { + fprintf(vtu_file, " \n", + cell_scalar_var_names[var]); + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(vtu_file, " %f\n", elem_fields(elem_gid, var)); + } + fprintf(vtu_file, " \n"); + } + fprintf(vtu_file, " \n"); + + // Close VTU file + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + fprintf(vtu_file, "\n"); + fclose(vtu_file); + + // Write PVTU file (only rank 0, after all ranks have written their VTU files) + MPI_Barrier(comm); + + if (rank == 0) { + str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d.pvtu", graphics_id); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + FILE* pvtu_file = fopen(filename, "w"); + if (!pvtu_file) { + std::cerr << "[rank 0] Failed to open PVTU file: " << filename << std::endl; + return; + } + + // Write PVTU XML header + fprintf(pvtu_file, "\n"); + fprintf(pvtu_file, "\n"); + fprintf(pvtu_file, " \n"); + + // Write PPoints + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + + // Write PCells + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + + // Write PPointData + fprintf(pvtu_file, " \n"); + for (int var = 0; var < num_point_vec_vars; var++) { + fprintf(pvtu_file, " \n", + point_vec_var_names[var]); + } + for (int var = 0; var < num_point_scalar_vars; var++) { + fprintf(pvtu_file, " \n", + point_scalar_var_names[var]); + } + fprintf(pvtu_file, " \n"); + + // Write PCellData + fprintf(pvtu_file, 
" \n"); + for (int var = 0; var < num_cell_vec_vars; var++) { + fprintf(pvtu_file, " \n", + cell_vec_var_names[var]); + } + for (int var = 0; var < num_cell_scalar_vars; var++) { + fprintf(pvtu_file, " \n", + cell_scalar_var_names[var]); + } + fprintf(pvtu_file, " \n"); + + // Write Piece references for each rank + for (int r = 0; r < world_size; r++) { + fprintf(pvtu_file, " \n", graphics_id, r); + } + + // Close PVTU file + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, "\n"); + fclose(pvtu_file); + } + +} // end write_vtu + + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn read_vtk_mesh + /// + /// \brief Read ASCII .vtk mesh file + /// + /// \param Simulation mesh + /// \param Simulation state + /// \param Node state struct + /// \param Number of dimensions + /// + ///////////////////////////////////////////////////////////////////////////// + void read_vtk_mesh(Mesh_t& mesh, + node_t& node, + int num_dims, + std::string mesh_file_) +{ + + std::cout<<"Reading VTK mesh"< v = split (str, delimiter); + + // looking for the following text: + // POINTS %d float + if(v[0] == "POINTS"){ + size_t num_nodes = std::stoi(v[1]); + printf("Number of nodes read in %zu\n", num_nodes); + mesh.initialize_nodes(num_nodes); + + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dims, required_node_state); + + found=true; + } // end if + + + if (i>1000){ + std::cerr << "ERROR: Failed to find POINTS in file" << std::endl; + break; + } // end if + + i++; + } // end while + + // read the node coordinates + for (node_gid=0; node_gid v = split (str, delimiter); + + // save the nodal coordinates + node.coords.host(node_gid, 0) = std::stod(v[0]); // double + node.coords.host(node_gid, 1) = std::stod(v[1]); // double + if(num_dims==3){ + node.coords.host(node_gid, 2) = std::stod(v[2]); // double + } + + } // end for nodes + + + // Update device nodal positions + node.coords.update_device(); + + + 
found=false; + + // look for CELLS + i = 0; + size_t num_elem = 0; + while (found==false) { + std::string str; + std::getline(in, str); + + std::string delimiter = " "; + std::vector v = split (str, delimiter); + std::cout << v[0] << std::endl; // printing + + // looking for the following text: + // CELLS num_elem size + if(v[0] == "CELLS"){ + num_elem = std::stoi(v[1]); + printf("Number of elements read in %zu\n", num_elem); + + // initialize elem variables + mesh.initialize_elems(num_elem, num_dims); + + found=true; + } // end if + + + if (i>1000){ + printf("ERROR: Failed to find CELLS \n"); + break; + } // end if + + i++; + } // end while + + + // read the node ids in the element + for (elem_gid=0; elem_gid v = split (str, delimiter); + num_nodes_in_elem = std::stoi(v[0]); + + for (size_t node_lid=0; node_lid v = split (str, delimiter); + + // looking for the following text: + // CELLS num_elem size + if(v[0] == "CELL_TYPES"){ + + std::getline(in, str); + elem_type = std::stoi(str); + + found=true; + } // end if + + + if (i>1000){ + printf("ERROR: Failed to find elem_TYPE \n"); + break; + } // end if + + i++; + } // end while + printf("Element type = %zu \n", elem_type); + // elem types: + // linear hex = 12, linear quad = 9 + found=false; + + + if(num_nodes_in_elem==8 & elem_type != 12) { + printf("Wrong element type of %zu \n", elem_type); + std::cerr << "ERROR: incorrect element type in VTK file" << std::endl; + } + + in.close(); + +} // end of VTKread function + +#endif \ No newline at end of file diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h new file mode 100644 index 00000000..eb3d5a6b --- /dev/null +++ b/examples/mesh_decomp/state.h @@ -0,0 +1,206 @@ +/********************************************************************************************** +� 2020. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. 
Government contract 89233218CNA000001 for Los Alamos +National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. +Department of Energy/National Nuclear Security Administration. All rights in the program are +reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear +Security Administration. The Government is granted for itself and others acting on its behalf a +nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare +derivative works, distribute copies to the public, perform publicly and display publicly, and +to permit others to do so. +This program is open source under the BSD-3 License. +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or other materials +provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used +to endorse or promote products derived from this software without specific prior +written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************************/ +#ifndef STATE_H +#define STATE_H + +#include "matar.h" +// #include "mpi_type.h" + +using namespace mtr; + + +// Possible node states, used to initialize node_t +enum class node_state +{ + coords, + scalar_field, + vector_field +}; + + +///////////////////////////////////////////////////////////////////////////// +/// +/// \struct node_t +/// +/// \brief Stores state information associated with a node +/// +///////////////////////////////////////////////////////////////////////////// +struct node_t +{ + + // Replace with MPIDCArrayKokkos + MPICArrayKokkos coords; ///< Nodal coordinates + MPICArrayKokkos coords_n0; ///< Nodal coordinates at tn=0 of time integration + + MPICArrayKokkos scalar_field; ///< Scalar field on a node + MPICArrayKokkos vector_field; ///< Vector field on a node + + + // initialization method (num_nodes, num_dims, state to allocate) + void initialize(size_t num_nodes, size_t num_dims, std::vector node_states) + { + + CommunicationPlan comm_plan; + + for (auto field : node_states){ + switch(field){ + case node_state::coords: + if (coords.size() == 0){ + this->coords = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates"); + this->coords.initialize_comm_plan(comm_plan); + } + if (coords_n0.size() == 0){ + this->coords_n0 = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates_n0"); + this->coords_n0.initialize_comm_plan(comm_plan); + 
} + break; + case node_state::scalar_field: + if (scalar_field.size() == 0) this->scalar_field = MPICArrayKokkos(num_nodes, "node_scalar_field"); + this->scalar_field.initialize_comm_plan(comm_plan); + break; + case node_state::vector_field: + if (vector_field.size() == 0) this->vector_field = MPICArrayKokkos(num_nodes, num_dims, "node_vector_field"); + this->vector_field.initialize_comm_plan(comm_plan); + break; + default: + std::cout<<"Desired node state not understood in node_t initialize"< node_states, CommunicationPlan& comm_plan) + { + for (auto field : node_states){ + switch(field){ + case node_state::coords: + if (coords.size() == 0){ + this->coords = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates"); + this->coords.initialize_comm_plan(comm_plan); + } + if (coords_n0.size() == 0){ + this->coords_n0 = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates_n0"); + this->coords_n0.initialize_comm_plan(comm_plan); + } + break; + case node_state::scalar_field: + if (scalar_field.size() == 0) this->scalar_field = MPICArrayKokkos(num_nodes, "node_scalar_field"); + this->scalar_field.initialize_comm_plan(comm_plan); + break; + case node_state::vector_field: + if (vector_field.size() == 0) this->vector_field = MPICArrayKokkos(num_nodes, num_dims, "node_vector_field"); + this->vector_field.initialize_comm_plan(comm_plan); + break; + default: + std::cout<<"Desired node state not understood in node_t initialize"< fields; + MPICArrayKokkos fields_vec; + + // initialization method (num_cells, num_dims) + void initialize(size_t num_gauss_pnts, size_t num_dims, std::vector gauss_pt_states, CommunicationPlan& comm_plan) + { + + for (auto field : gauss_pt_states){ + switch(field){ + case gauss_pt_state::fields: + //if (fields.size() == 0) this->fields = DCArrayKokkos(num_gauss_pnts, "gauss_point_fields"); + if (fields.size() == 0){ + this->fields = MPICArrayKokkos(num_gauss_pnts, "gauss_point_fields"); + this->fields.initialize_comm_plan(comm_plan); + } + break; + 
case gauss_pt_state::fields_vec: + if (fields_vec.size() == 0){ + this->fields_vec = MPICArrayKokkos(num_gauss_pnts, num_dims, "gauss_point_fields_vec"); + this->fields_vec.initialize_comm_plan(comm_plan); + } + break; + default: + std::cout<<"Desired gauss point state not understood in GaussPoint_t initialize"< + #ifdef HAVE_MPI #include -#include "partition_map.h" +#include "matar.h" -namespace mtr -{ +#include -///////////////////////// -/* CommunicationPlan: Class storing relevant data and functions to perform comms between two different MATAR MPI types. - The object for this class should not be reconstructed if the same comm plan is needed repeatedly; the setup is expensive. - The comms routines such as execute_comms can be called repeatedly to avoid repeated setup of the plan.*/ -///////////////////////// -template -class CommunicationPlan { +using namespace mtr; - // this is manage - using TArray1D = Kokkos::DualView ; - -protected: -public: - - /*forward comms means communicating data to a vector that doesn't have a unique distribution of its global - indices amongst processes from a vector that does have a unique distribution amongst processes. - An example of forward comms in a finite element application would be communicating ghost data from - the vector of local data. - - reverse comms means communicating data to a vector that has a unique distribution of its global - indices amongst processes from a vector that does not have a unique distribution amongst processes. - An example of reverse comms in a finite element application would be communicating force contributions from ghost - indices via summation to the entries of the uniquely owned vector that stores final tallies of forces. 
- */ - bool reverse_comms_flag; //default is false - - CommunicationPlan(); - - //Copy Constructor - CommunicationPlan(const CommunicationPlan &temp){ - *this = temp; - } +enum class communication_plan_type { + no_communication, + all_to_all_graph +}; + + +struct CommunicationPlan { - CommunicationPlan(bool reverse_comms); + // ======================================================================== + // Metadata for MPI neighbor graph communication + // ======================================================================== - KOKKOS_INLINE_FUNCTION - CommunicationPlan& operator=(const CommunicationPlan& temp); + communication_plan_type comm_type = communication_plan_type::no_communication; - // Deconstructor - virtual KOKKOS_INLINE_FUNCTION - ~CommunicationPlan (); + // MPI world communicator + MPI_Comm mpi_comm_world; + bool has_comm_world = false; + int world_size = -1; - virtual void execute_comms(){} -}; // End of CommunicationPlan + // MPI graph communicator + MPI_Comm mpi_comm_graph; + bool has_comm_graph = false; + + // Number of send and recv ranks + int num_send_ranks; // In MPI language, this is the outdegree of the graph communicator + int num_recv_ranks; // In MPI language, this is the indegree of the graph communicator + // Rank IDs for send and recv ranks + DCArrayKokkos send_rank_ids; // [size: num_send_ranks] Destination rank IDs + DCArrayKokkos recv_rank_ids; // [size: num_recv_ranks] Source rank IDs -// Default constructor -template -CommunicationPlan::CommunicationPlan() { + // recv_weights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + int* recv_weights = MPI_UNWEIGHTED; // [size: num_recv_ranks] Weights on incoming edges, set to MPI_UNWEIGHTED if not used + + // send_weights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + int* send_weights = 
MPI_UNWEIGHTED; // [size: num_send_ranks] Weights on outgoing edges, set to MPI_UNWEIGHTED if not used -} + // info: Hints for optimization (MPI_INFO_NULL means use defaults) + MPI_Info info = MPI_INFO_NULL; + + // reorder: Whether to allow MPI to reorder ranks for optimization (0=no reordering) + // Setting to 0 preserves original rank numbering + // Note: In the future, we may want to allow MPI to reorder ranks for optimization by setting to 1, + // this would allow MPI to reorder the ranks to make them physically closer on the hardware. + // This is a good optimization for large meshes, but will require maps from MPI_comm_world rank IDs to the new reordered rank IDs. + int reorder = 0; -// Overloaded 1D constructor -template -CommunicationPlan::CommunicationPlan(bool reverse_comms) { - reverse_comms_flag = reverse_comms; -} + DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_per_rank] Indices of items to send to each rank + DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_per_rank] Indices of items to receive from each rank + DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank + DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank + + + DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank + DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank -template -KOKKOS_INLINE_FUNCTION -CommunicationPlan& CommunicationPlan::operator= (const CommunicationPlan& temp) { + int total_send_count; // Total number of items to send + int total_recv_count; // Total number of items to receive + + // ======================================================================== + // CONSTRUCTOR / INITIALIZATION + // ======================================================================== + + CommunicationPlan() + : num_send_ranks(0), 
num_recv_ranks(0), + has_comm_graph(false) {} + + + // Destructor to free MPI resources + ~CommunicationPlan() { + // Free graph communicator + if (has_comm_graph && mpi_comm_graph != MPI_COMM_NULL) { + MPI_Comm_free(&mpi_comm_graph); + } + } + - // Do nothing if the assignment is of the form x = x - if (this != &temp) { - reverse_comms_flag = temp.reverse_comms_flag; + void initialize(MPI_Comm comm_world){ + this->mpi_comm_world = comm_world; + has_comm_world = true; + MPI_Comm_size(comm_world, &world_size); } - return *this; -} + /** + * @brief Initialize an MPI distributed graph communicator for sparse neighbor communication. + * + * This function creates an MPI "dist graph communicator" tailored to the sparse data exchange + * patterns typical in mesh-based parallel applications. It establishes direct knowledge for MPI + * about which processes (ranks) each process will communicate with. This improves the efficiency + * and clarity of later communication (for example, with MPI_Neighbor_alltoallv). + * + * This function is especially useful when the communication pattern is not all-to-all, but rather + * a sparse subset: for instance, where each process only exchanges data with a few neighbors. + * + * ==== Key Concepts ==== + * - MPI Communicator: An MPI object representing a group of processes that can communicate with each other. + * For context, "MPI_COMM_WORLD" is a communicator including all processes, but a graph communicator + * customizes direct process connections. + * - Rank: Integer ID identifying a process in a communicator. + * - Distributed Graph: MPI can represent communication as a directed sparse graph, with edges from + * this rank to those it needs to send to, and from those it will receive from. + * + * ==== Parameters ==== + * @param num_send_ranks [in] Number of ranks this process will send data to (out-neighbors). + * @param send_rank_ids [in] Array of size num_send_ranks; each entry is the rank of a process to send to. 
+ * @param num_recv_ranks [in] Number of ranks this process will receive data from (in-neighbors). + * @param recv_rank_ids [in] Array of size num_recv_ranks; each entry is the rank of a process to receive from. + * + * ==== Steps ==== + * + * 1. Checks if the basic communicator has been initialized. + * Throws an error if it has not. + * + * 2. Stores the send/receive neighbor counts and rank lists internally. + * Copies the IDs into the internal device-host arrays. + * - send_rank_ids: process IDs that will be destinations for outgoing messages. + * - recv_rank_ids: process IDs that will provide incoming messages. + * + * 3. Calls MPI_Dist_graph_create_adjacent: + * This constructs a new MPI communicator ("mpi_comm_graph") that encodes this process's + * inbound and outbound neighbors. MPI uses this to optimize and route messages directly + * and efficiently during later neighbor collectives. + * + * - Note: The 'recv_weights' and 'send_weights' arguments are set to NULL here; + * this means we are not giving extra weighting or priorities to any connection. + * - The 'reorder' argument (set to 0 in this class) disables rank reordering; + * this ensures the assignment of process ranks is preserved, which is often needed + * for mapping data or results back to physical entities. + * - On return, 'mpi_comm_graph' will allow use of "neighbor" collectives (MPI_Neighbor_alltoall[v], etc.), + * which automatically use the provided topology to send/receive to only neighbors efficiently. + * + * 4. Marks the internal flag indicating that the graph communicator has been set up ("has_comm_graph"). + * + * ==== Example Usage ==== + * Suppose rank 0 will send to ranks 1 and 2, and receive from rank 3 only: + * int send_ranks[2] = {1, 2}; + * int recv_ranks[1] = {3}; + * initialize_graph_communicator(2, send_ranks, 1, recv_ranks); + * + * ==== Why Use This? 
==== + * - This avoids the need to do manual pairwise MPI_Send/MPI_Recv in your code, + * and enables the use of neighbor collectives -- concise, scalable, and hard-to-get-wrong. + * - It explicitly tells MPI only about your neighbors, so it can optimize routes and memory. + * - If you have a large number of processes or a mesh/network with only local coupling, + * this approach scales much better than using global/all-to-all communication. + * + * @throws std::runtime_error if the base communicator has not been initialized. + */ + void initialize_graph_communicator(int num_send_ranks, int* send_rank_ids, int num_recv_ranks, int* recv_rank_ids){ + + this->comm_type = communication_plan_type::all_to_all_graph; + // Check if the MPI_COMM_WORLD communicator has been initialized. + if(!has_comm_world){ + throw std::runtime_error("MPI communicator for the world has not been initialized"); + } + + // Store the number of outbound and inbound neighbors + this->num_send_ranks = num_send_ranks; + this->num_recv_ranks = num_recv_ranks; + + // Copy and store send neighbor IDs (out-bound neighbors: where we will send data to) + this->send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); + for(int i = 0; i < num_send_ranks; i++){ + this->send_rank_ids.host(i) = send_rank_ids[i]; + } + this->send_rank_ids.update_device(); + MATAR_FENCE(); -template -KOKKOS_INLINE_FUNCTION -CommunicationPlan::~CommunicationPlan() {} + // Copy and store receive neighbor IDs (in-bound neighbors: where we will receive data from) + this->recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); + for(int i = 0; i < num_recv_ranks; i++){ + this->recv_rank_ids.host(i) = recv_rank_ids[i]; + } + this->recv_rank_ids.update_device(); + MATAR_FENCE(); + + // Create the distributed graph communicator. + // This call links this process to its explicit send and receive neighbors. + // See https://www.open-mpi.org/doc/v4.0/man3/MPI_Dist_graph_create_adjacent.3.php for more details. 
+ MPI_Dist_graph_create_adjacent( + mpi_comm_world, // Existing communicator (usually MPI_COMM_WORLD) + num_recv_ranks, // Number of in-neighbors (recv) + this->recv_rank_ids.host_pointer(), // Array of in-neighbor ranks (who we receive from) + recv_weights, // Edge weights (NULL = unweighted) + num_send_ranks, // Number of out-neighbors (send) + this->send_rank_ids.host_pointer(), // Array of out-neighbor ranks (who we send to) + send_weights, // Edge weights (NULL = unweighted) + info, // Additional info for MPI (not used, set to MPI_INFO_NULL) + reorder, // Allow MPI to reorder ranks for performance (0 disables) + &mpi_comm_graph // [out] New graph communicator + ); -//////////////////////////////////////////////////////////////////////////////// -// End of CommunicationPlan -//////////////////////////////////////////////////////////////////////////////// + // Set the internal flag indicating that we have created the MPI distributed graph communicator. + has_comm_graph = true; + } + + // Useful function for debugging, possibly remove + void verify_graph_communicator(){ + if(!has_comm_graph){ + throw std::runtime_error("MPI graph communicator has not been initialized"); + } + + // ============================================================================ + // Verify the distributed graph communicator + // ============================================================================ + // Query the graph to verify it matches what we specified + int indegree_out, outdegree_out, weighted; + MPI_Dist_graph_neighbors_count(mpi_comm_graph, &indegree_out, &outdegree_out, &weighted); + + // Allocate arrays to receive neighbor information + std::vector sources_out(indegree_out); + std::vector sourceweights_out(indegree_out); + std::vector destinations_out(outdegree_out); + std::vector destweights_out(outdegree_out); + + // Retrieve the actual neighbors from the graph communicator + MPI_Dist_graph_neighbors(mpi_comm_graph, + indegree_out, sources_out.data(), 
sourceweights_out.data(), + outdegree_out, destinations_out.data(), destweights_out.data()); + + int rank = -1; + MPI_Comm_rank(mpi_comm_world, &rank); + + // Additional verification: Check if the queried values match our input + bool verification_passed = true; + + // Print verification information for each rank sequentially + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(mpi_comm_world); + if (rank == r) { + std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << std::endl; + std::cout << " Indegree (receives from " << indegree_out << " ranks): "; + for (int i = 0; i < indegree_out; ++i) { + std::cout << sources_out[i] << " "; + } + std::cout << std::endl; + + std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; + for (int i = 0; i < outdegree_out; ++i) { + std::cout << destinations_out[i] << " "; + } + std::cout << std::endl; + + std::cout << " Weighted: " << (weighted ? "yes" : "no") << std::endl; + } + MPI_Barrier(mpi_comm_world); + } + + // Check if the counts match our stored values + if (indegree_out != num_recv_ranks) { + std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " + << "Expected " << num_recv_ranks << ", got " << indegree_out << std::endl; + verification_passed = false; + } + if (outdegree_out != num_send_ranks) { + std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! " + << "Expected " << num_send_ranks << ", got " << outdegree_out << std::endl; + verification_passed = false; + } + + // Check if source ranks match (build set from our stored recv_rank_ids) + std::set sources_set_in; + for (int i = 0; i < num_recv_ranks; ++i) { + sources_set_in.insert(recv_rank_ids.host(i)); + } + std::set sources_set_out(sources_out.begin(), sources_out.end()); + if (sources_set_in != sources_set_out) { + std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" 
<< std::endl; + verification_passed = false; + } + + // Check if destination ranks match (build set from our stored send_rank_ids) + std::set dests_set_in; + for (int i = 0; i < num_send_ranks; ++i) { + dests_set_in.insert(send_rank_ids.host(i)); + } + std::set dests_set_out(destinations_out.begin(), destinations_out.end()); + if (dests_set_in != dests_set_out) { + std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" << std::endl; + verification_passed = false; + } + + // Global verification check + int local_passed = verification_passed ? 1 : 0; + int global_passed = 0; + MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, mpi_comm_world); + MPI_Barrier(mpi_comm_world); + if (rank == 0) { + if (global_passed) { + std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; + } else { + std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; + } + } + MPI_Barrier(mpi_comm_world); + } -} // end namespace + // Setup send/receive metadata + void setup_send_recv(DRaggedRightArrayKokkos &rank_send_ids, DRaggedRightArrayKokkos &rank_recv_ids){ + + this->send_indices_ = rank_send_ids; // indices of element data to send to each rank + this->recv_indices_ = rank_recv_ids; // indices of element data to receive from each rank + + // Setup send data + this->send_counts_ = DCArrayKokkos(num_send_ranks, "send_counts"); + this->total_send_count = 0; + for(int i = 0; i < num_send_ranks; i++){ + this->send_counts_.host(i) = rank_send_ids.stride_host(i); + this->total_send_count += this->send_counts_.host(i); + } + this->send_counts_.update_device(); + + this->send_displs_ = DCArrayKokkos(num_send_ranks, "send_displs"); + for(int i = 0; i < num_send_ranks; i++){ + this->send_displs_.host(i) = 0; + for(int j = 0; j < i; j++){ + this->send_displs_.host(i) += this->send_counts_.host(j); + } + } + this->send_displs_.update_device(); + + // Setup recv data + this->recv_counts_ = 
DCArrayKokkos(num_recv_ranks, "recv_counts"); + this->total_recv_count = 0; + for(int i = 0; i < num_recv_ranks; i++){ + this->recv_counts_.host(i) = rank_recv_ids.stride_host(i); + this->total_recv_count += this->recv_counts_.host(i); + } + this->recv_counts_.update_device(); + + this->recv_displs_ = DCArrayKokkos(num_recv_ranks, "recv_displs"); + for(int i = 0; i < num_recv_ranks; i++){ + this->recv_displs_.host(i) = 0; + for(int j = 0; j < i; j++){ + this->recv_displs_.host(i) += this->recv_counts_.host(j); + } + } + this->recv_displs_.update_device(); + MATAR_FENCE(); + } + + // Useful function for debugging, possibly remove + void verify_send_recv(){ + + if(!has_comm_graph){ + throw std::runtime_error("Graph communicator has not been initialized"); + } + + int rank = -1; + MPI_Comm_rank(mpi_comm_world, &rank); + + bool local_verification_passed = true; + + // ============================================================================ + // Local Verification: Check consistency of counts and displacements + // ============================================================================ + + // Verify send counts and displacements + int computed_total_send = 0; + for(int i = 0; i < num_send_ranks; i++){ + computed_total_send += send_counts_.host(i); + + // Verify displacements are consistent + int expected_displs = 0; + for(int j = 0; j < i; j++){ + expected_displs += send_counts_.host(j); + } + if(send_displs_.host(i) != expected_displs){ + std::cerr << "[rank " << rank << "] ERROR: send_displs[" << i << "] mismatch! " + << "Expected " << expected_displs << ", got " << send_displs_.host(i) << std::endl; + local_verification_passed = false; + } + } + + // Verify total send count + if(computed_total_send != total_send_count){ + std::cerr << "[rank " << rank << "] ERROR: total_send_count mismatch! 
" + << "Expected " << computed_total_send << ", got " << total_send_count << std::endl; + local_verification_passed = false; + } + + // Verify recv counts and displacements + int computed_total_recv = 0; + for(int i = 0; i < num_recv_ranks; i++){ + computed_total_recv += recv_counts_.host(i); + + // Verify displacements are consistent + int expected_displs = 0; + for(int j = 0; j < i; j++){ + expected_displs += recv_counts_.host(j); + } + if(recv_displs_.host(i) != expected_displs){ + std::cerr << "[rank " << rank << "] ERROR: recv_displs[" << i << "] mismatch! " + << "Expected " << expected_displs << ", got " << recv_displs_.host(i) << std::endl; + local_verification_passed = false; + } + } + + // Verify total recv count + if(computed_total_recv != total_recv_count){ + std::cerr << "[rank " << rank << "] ERROR: total_recv_count mismatch! " + << "Expected " << computed_total_recv << ", got " << total_recv_count << std::endl; + local_verification_passed = false; + } + + // Verify send indices are within bounds (basic sanity check) + for(int i = 0; i < num_send_ranks; i++){ + for(int j = 0; j < send_indices_.stride_host(i); j++){ + int idx = send_indices_.host(i, j); + if(idx < 0){ + std::cerr << "[rank " << rank << "] ERROR: negative send index at rank " << i + << ", index " << j << ": " << idx << std::endl; + local_verification_passed = false; + } + } + } + + // Verify recv indices are within bounds (basic sanity check) + for(int i = 0; i < num_recv_ranks; i++){ + for(int j = 0; j < recv_indices_.stride_host(i); j++){ + int idx = recv_indices_.host(i, j); + if(idx < 0){ + std::cerr << "[rank " << rank << "] ERROR: negative recv index at rank " << i + << ", index " << j << ": " << idx << std::endl; + local_verification_passed = false; + } + } + } + + // ============================================================================ + // Print local verification information for each rank sequentially + // 
============================================================================ + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(mpi_comm_world); + if (rank == r) { + std::cout << "\n[rank " << rank << "] Send/Recv Communication Plan Verification:" << std::endl; + + std::cout << " Send Configuration:" << std::endl; + std::cout << " - Num send ranks: " << num_send_ranks << std::endl; + std::cout << " - Total send count: " << total_send_count << std::endl; + std::cout << " - Send counts per rank: "; + for (int i = 0; i < num_send_ranks; ++i) { + std::cout << send_counts_.host(i) << " "; + } + std::cout << std::endl; + std::cout << " - Send displacements: "; + for (int i = 0; i < num_send_ranks; ++i) { + std::cout << send_displs_.host(i) << " "; + } + std::cout << std::endl; + + std::cout << " Recv Configuration:" << std::endl; + std::cout << " - Num recv ranks: " << num_recv_ranks << std::endl; + std::cout << " - Total recv count: " << total_recv_count << std::endl; + std::cout << " - Recv counts per rank: "; + for (int i = 0; i < num_recv_ranks; ++i) { + std::cout << recv_counts_.host(i) << " "; + } + std::cout << std::endl; + std::cout << " - Recv displacements: "; + for (int i = 0; i < num_recv_ranks; ++i) { + std::cout << recv_displs_.host(i) << " "; + } + std::cout << std::endl; + } + MPI_Barrier(mpi_comm_world); + } + + // ============================================================================ + // Global Verification: Use MPI to verify consistency across ranks + // ============================================================================ + int local_passed = local_verification_passed ? 
1 : 0; + int global_passed = 0; + MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, mpi_comm_world); + MPI_Barrier(mpi_comm_world); + + if (rank == 0) { + if (global_passed) { + std::cout << "\n✓ Send/Recv communication plan verification PASSED on all ranks\n" << std::endl; + } else { + std::cout << "\n✗ Send/Recv communication plan verification FAILED on one or more ranks\n" << std::endl; + } + } + MPI_Barrier(mpi_comm_world); + + if(!global_passed){ + throw std::runtime_error("Send/Recv communication plan verification failed"); + } + } +}; // End of CommunicationPlan -#endif // end if have MPI +#endif // end if HAVE_MPI +#endif // end if COMMUNICATION_PLAN_H -#endif // COMMUNICATION_PLAN_H diff --git a/src/include/communication_plan_old.h b/src/include/communication_plan_old.h new file mode 100644 index 00000000..302cb119 --- /dev/null +++ b/src/include/communication_plan_old.h @@ -0,0 +1,135 @@ +#ifndef COMMUNICATION_PLAN_H +#define COMMUNICATION_PLAN_H +/********************************************************************************************** + © 2020. Triad National Security, LLC. All rights reserved. + This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos + National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. + Department of Energy/National Nuclear Security Administration. All rights in the program are + reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear + Security Administration. The Government is granted for itself and others acting on its behalf a + nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare + derivative works, distribute copies to the public, perform publicly and display publicly, and + to permit others to do so. + This program is open source under the BSD-3 License. 
+ Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + + 3. Neither the name of the copyright holder nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior + written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + **********************************************************************************************/ + +#include "host_types.h" +#include "kokkos_types.h" +#include +#ifdef HAVE_MPI +#include +#include "partition_map.h" + +namespace mtr +{ + +///////////////////////// +/* CommunicationPlan: Class storing relevant data and functions to perform comms between two different MATAR MPI types. + The object for this class should not be reconstructed if the same comm plan is needed repeatedly; the setup is expensive. 
+ The comms routines such as execute_comms can be called repeatedly to avoid repeated setup of the plan.*/ +///////////////////////// +template +class CommunicationPlan { + + // this is manage + using TArray1D = Kokkos::DualView ; + +protected: + +public: + + /*forward comms means communicating data to a vector that doesn't have a unique distribution of its global + indices amongst processes from a vector that does have a unique distribution amongst processes. + An example of forward comms in a finite element application would be communicating ghost data from + the vector of local data. + + reverse comms means communicating data to a vector that has a unique distribution of its global + indices amongst processes from a vector that does not have a unique distribution amongst processes. + An example of reverse comms in a finite element application would be communicating force contributions from ghost + indices via summation to the entries of the uniquely owned vector that stores final tallies of forces. 
+ */ + bool reverse_comms_flag; //default is false + + CommunicationPlan(); + + //Copy Constructor + CommunicationPlan(const CommunicationPlan &temp){ + *this = temp; + } + + CommunicationPlan(bool reverse_comms); + + KOKKOS_INLINE_FUNCTION + CommunicationPlan& operator=(const CommunicationPlan& temp); + + // Deconstructor + virtual KOKKOS_INLINE_FUNCTION + ~CommunicationPlan (); + + virtual void execute_comms(){} +}; // End of CommunicationPlan + + +// Default constructor +template +CommunicationPlan::CommunicationPlan() { + +} + +// Overloaded 1D constructor +template +CommunicationPlan::CommunicationPlan(bool reverse_comms) { + reverse_comms_flag = reverse_comms; +} + + +template +KOKKOS_INLINE_FUNCTION +CommunicationPlan& CommunicationPlan::operator= (const CommunicationPlan& temp) { + + // Do nothing if the assignment is of the form x = x + if (this != &temp) { + reverse_comms_flag = temp.reverse_comms_flag; + } + + return *this; +} + +template +KOKKOS_INLINE_FUNCTION +CommunicationPlan::~CommunicationPlan() {} + +//////////////////////////////////////////////////////////////////////////////// +// End of CommunicationPlan +//////////////////////////////////////////////////////////////////////////////// + +} // end namespace + +#endif // end if have MPI + +#endif // COMMUNICATION_PLAN_H + diff --git a/src/include/mapped_mpi_types.h b/src/include/mapped_mpi_types.h index 6d5d18d3..ed690ca6 100644 --- a/src/include/mapped_mpi_types.h +++ b/src/include/mapped_mpi_types.h @@ -45,7 +45,6 @@ #include #include #include "partition_map.h" -#include "communication_plan.h" namespace mtr { diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index b10a57fc..5f83265b 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -1,121 +1,148 @@ -#ifndef MPI_TYPES_H -#define MPI_TYPES_H -/********************************************************************************************** - © 2020. Triad National Security, LLC. All rights reserved. 
- This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos - National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. - Department of Energy/National Nuclear Security Administration. All rights in the program are - reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear - Security Administration. The Government is granted for itself and others acting on its behalf a - nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare - derivative works, distribute copies to the public, perform publicly and display publicly, and - to permit others to do so. - This program is open source under the BSD-3 License. - Redistribution and use in source and binary forms, with or without modification, are permitted - provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, this list of - conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, this list of - conditions and the following disclaimer in the documentation and/or other materials - provided with the distribution. - - 3. Neither the name of the copyright holder nor the names of its contributors may be used - to endorse or promote products derived from this software without specific prior - written permission. - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - **********************************************************************************************/ - -#include "host_types.h" -#include "kokkos_types.h" -#include +#ifndef MPICARRAYKOKKOS_H +#define MPICARRAYKOKKOS_H + #ifdef HAVE_MPI #include +#include "matar.h" +#include "communication_plan.h" namespace mtr { +// Type trait to map C++ types to MPI_Datatype +template +struct mpi_type_map { + static MPI_Datatype value() { + static_assert(sizeof(T) == 0, "Unsupported type for MPI communication"); + return MPI_DATATYPE_NULL; + } +}; + +// Specializations for common types +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_INT; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_LONG_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_FLOAT; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_DOUBLE; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_CHAR; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED_CHAR; } +}; + +template <> +struct 
mpi_type_map { + static MPI_Datatype value() { return MPI_C_BOOL; } +}; + + ///////////////////////// -// MPIArrayKokkos: Dual type for managing distributed data on both CPU and GPU. +// MPICArrayKokkos: Type for managing distributed data on both CPU and GPU. ///////////////////////// template -class MPIArrayKokkos { +class MPICArrayKokkos { + + // Dual view for managing data on both CPU and GPU + DCArrayKokkos this_array_; - // this is manage - using TArray1D = Kokkos::DualView ; + DCArrayKokkos send_buffer_; + DCArrayKokkos recv_buffer_; protected: - size_t dims_[7]; - size_t length_; - size_t order_; // tensor order (rank) - int mpi_recv_rank_; - int mpi_tag_; + size_t dims_[7] = {0,0,0,0,0,0,0}; + size_t length_ = 0; + size_t order_ = 0; // tensor order (rank) + MPI_Comm mpi_comm_; MPI_Status mpi_status_; MPI_Datatype mpi_datatype_; MPI_Request mpi_request_; - TArray1D this_array_; - - void set_mpi_type(); -public: - // Data member to access host view - ViewCArray host; - - MPIArrayKokkos(); - MPIArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); + // --- Ghost Communication Support --- + CommunicationPlan* comm_plan_ = NULL; // Pointer to shared communication plan - MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); - - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); + DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank + DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank + DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank + DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, 
- size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); + size_t stride_; // [size: num_dims] Number of contiguous values per first index element - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, size_t dim5, - size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); + DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_by_rank] Indices of items to send to each rank + DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_by_rank] Indices of items to receive from each rank + - // These functions can setup the data needed for halo send/receives - // Not necessary for standard MPI comms - void mpi_setup(); + size_t num_owned_; // Number of owned items (nodes/elements) + size_t num_ghost_; // Number of ghost items (nodes/elements) - void mpi_setup(int recv_rank); +public: + // Data member to access host view (initialized as pointer to this_array_.host_pointer()) + ViewCArray host; - void mpi_setup(int recv_rank, int tag); - void mpi_setup(int recv_rank, int tag, MPI_Comm comm); + // Note, consider this for sending blocks without dealing with stride_ + // MPI_Datatype vector_type; + // MPI_Type_contiguous(stride_, mpi_type_map::value(), &vector_type); + // MPI_Type_commit(&vector_type); - void mpi_set_rank(int recv_rank); + MPICArrayKokkos(); + + MPICArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); - void mpi_set_tag(int tag); + MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); - void mpi_set_comm(MPI_Comm comm); + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); - int get_rank(); + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, const 
std::string& tag_string = DEFAULTSTRINGARRAY); - int get_tag(); + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPI_Comm get_comm(); + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, + size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); + KOKKOS_INLINE_FUNCTION T& operator()(size_t i) const; @@ -140,7 +167,52 @@ class MPIArrayKokkos { size_t n, size_t o) const; KOKKOS_INLINE_FUNCTION - MPIArrayKokkos& operator=(const MPIArrayKokkos& temp); + MPICArrayKokkos& operator=(const MPICArrayKokkos& temp); + + + // Method to set comm plan for halo communication + void initialize_comm_plan(CommunicationPlan& comm_plan){ + comm_plan_ = &comm_plan; + + if(comm_plan_->comm_type == communication_plan_type::no_communication){ + return; + } + + size_t send_size = comm_plan_->total_send_count * stride_; + size_t recv_size = comm_plan_->total_recv_count * stride_; + + if (send_size > 0) { + send_buffer_ = DCArrayKokkos(send_size, "send_buffer"); + } + if (recv_size > 0) { + recv_buffer_ = DCArrayKokkos(recv_size, "recv_buffer"); + } + + if (comm_plan_->num_send_ranks > 0) { + send_counts_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_counts"); + send_displs_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_displs"); + + for(int i = 0; i < comm_plan_->num_send_ranks; i++){ + send_counts_.host(i) = comm_plan_->send_counts_.host(i) * stride_; + send_displs_.host(i) = comm_plan_->send_displs_.host(i) * stride_; + } + send_counts_.update_device(); + send_displs_.update_device(); + } + + if (comm_plan_->num_recv_ranks > 0) { + recv_counts_ = DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_counts"); + recv_displs_ = DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_displs"); + + for(int 
i = 0; i < comm_plan_->num_recv_ranks; i++){ + recv_counts_.host(i) = comm_plan_->recv_counts_.host(i) * stride_; + recv_displs_.host(i) = comm_plan_->recv_displs_.host(i) * stride_; + } + recv_counts_.update_device(); + recv_displs_.update_device(); + } + }; + // GPU Method // Method that returns size @@ -168,7 +240,7 @@ class MPIArrayKokkos { // Method returns kokkos dual view KOKKOS_INLINE_FUNCTION - TArray1D get_kokkos_dual_view() const; + Kokkos::DualView get_kokkos_dual_view() const; // Method that update host view void update_host(); @@ -176,167 +248,188 @@ class MPIArrayKokkos { // Method that update device view void update_device(); - // MPI send wrapper - void send(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI recieve wrapper - void recv(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI broadcast wrapper - void broadcast(size_t count, int root, MPI_Comm comm); - - // MPI scatter wrapper - void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - - // MPI gather wrapper - void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - - // MPI allgather wrapper - void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); - - // MPI send wrapper - void isend(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI recieve wrapper - void irecv(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI wait wrapper for sender - void wait_send(); - - // MPI wait wrapper for receiver - void wait_recv(); - - // MPI barrier wrapper - //void barrier(MPI_Comm comm); - - // MPI send wrapper - void halo_send(); - - // MPI recieve wrapper - void halo_recv(); - - // MPI send wrapper - void halo_isend(); - - // MPI recieve wrapper - void halo_irecv(); + // Method that builds the send buffer, note, this has to be ordered + // Such that all the boundary elements going to a given rank are contiguous in the send buffer. 
+ void fill_send_buffer(){ + + // Copy this_array_ to the host + this_array_.update_host(); + MATAR_FENCE(); + + size_t send_idx = 0; + for(int i = 0; i < comm_plan_->num_send_ranks; i++){ + for(int j = 0; j < comm_plan_->send_counts_.host(i); j++){ + size_t src_idx = comm_plan_->send_indices_.host(i, j); // index of the element to send + + // Copy all values associated with this element (handles multi-dimensional arrays) + for(size_t k = 0; k < stride_; k++){ + send_buffer_.host(send_idx + k) = this_array_.host_pointer()[src_idx * stride_ + k]; + } + send_idx += stride_; + } + } + }; + + // Method that copies the recv buffer into the this_array + void copy_recv_buffer(){ + + size_t recv_idx = 0; + for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ + for(int j = 0; j < comm_plan_->recv_counts_.host(i); j++){ + size_t dest_idx = comm_plan_->recv_indices_.host(i, j); + + // Copy all values associated with this element (handles multi-dimensional arrays) + for(size_t k = 0; k < stride_; k++){ + this_array_.host_pointer()[dest_idx * stride_ + k] = recv_buffer_.host(recv_idx + k); + } + + recv_idx += stride_; + } + } + }; + + + // Note: This "may" be needed, im not sure. Currently, it works.... + // Use nullptr for empty arrays to avoid accessing element 0 of 0-sized array (undefined behavior) + // T* send_buf_ptr = (send_buffer_.size() > 0) ? &send_buffer_.host(0) : nullptr; + // T* recv_buf_ptr = (recv_buffer_.size() > 0) ? &recv_buffer_.host(0) : nullptr; + // int* send_cnt_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_counts_.host(0) : nullptr; + // int* send_dsp_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_displs_.host(0) : nullptr; + // int* recv_cnt_ptr = (comm_plan_->num_recv_ranks > 0) ? &comm_plan_->recv_counts_.host(0) : nullptr; + // int* recv_dsp_ptr = (comm_plan_->num_recv_ranks > 0) ? 
&comm_plan_->recv_displs_.host(0) : nullptr; + + // Method that communicates the data between the ranks + // NOTE: This is a blocking communication operation, + // if you want to use non-blocking communication, you can use the following: MPI_Ineighbor_alltoallv + + // TODO: Replace this with persistent communicator: + // MPI_Request req; + + // // Create persistent operation ONCE + // MPI_Neighbor_alltoallv_init( + // sendbuf, sendcounts, sdispls, mpi_type_map::value(), + // recvbuf, recvcounts, rdispls, mpi_type_map::value(), + // comm_plan_->mpi_comm_graph, + // MPI_INFO_NULL, + // &req); + + // // Then inside time step loop: + // MPI_Start(&req); + // // modify sendbuf in-place as needed + // MPI_Wait(&req); + + void communicate(){ + + fill_send_buffer(); + + MPI_Neighbor_alltoallv( + send_buffer_.host_pointer(), + send_counts_.host_pointer(), + send_displs_.host_pointer(), + mpi_type_map::value(), // MPI_TYPE + recv_buffer_.host_pointer(), + recv_counts_.host_pointer(), + recv_displs_.host_pointer(), + mpi_type_map::value(), // MPI_TYPE + comm_plan_->mpi_comm_graph); + + copy_recv_buffer(); + this_array_.update_device(); + MATAR_FENCE(); + }; + + void set_values(const T& value){ + this_array_.set_values(value); + }; // Deconstructor virtual KOKKOS_INLINE_FUNCTION - ~MPIArrayKokkos (); -}; // End of MPIArrayKokkos - + ~MPICArrayKokkos (); +}; // End of MPIDArrayKokkos // Default constructor template -MPIArrayKokkos::MPIArrayKokkos() { - length_ = order_ = 0; - for (int i = 0; i < 7; i++) { - dims_[i] = 0; - } +MPICArrayKokkos::MPICArrayKokkos() + : this_array_(), stride_(1), length_(0), order_(0) { + for (int i = 0; i < 7; i++) { + dims_[i] = 0; + } } // Overloaded 1D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) + : stride_(1), length_(dim0), order_(1) { dims_[0] = dim0; - order_ = 1; - length_ = dim0; - this_array_ = 
TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0); } // Overloaded 2D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) + : stride_(dim1), length_(dim0 * dim1), order_(2) { dims_[0] = dim0; dims_[1] = dim1; - order_ = 2; - length_ = (dim0 * dim1); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1); - set_mpi_type(); + + this_array_ = DCArrayKokkos(dim0, dim1, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1); } +// Overloaded 3D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) + : stride_(dim1 * dim2), length_(dim0 * dim1 * dim2), order_(3) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; - order_ = 3; - length_ = (dim0 * dim1 * dim2); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2); } +// Overloaded 4D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, size_t dim3, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3), length_(dim0 * dim1 * dim2 * dim3), order_(4) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; dims_[3] = dim3; - 
order_ = 4; - length_ = (dim0 * dim1 * dim2 * dim3); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3); } +// Overloaded 5D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, size_t dim3, - size_t dim4, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4), length_(dim0 * dim1 * dim2 * dim3 * dim4), order_(5) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; dims_[3] = dim3; dims_[4] = dim4; - order_ = 5; - length_ = (dim0 * dim1 * dim2 * dim3 * dim4); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4); } +// Overloaded 6D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, size_t dim3, - size_t dim4, size_t dim5, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4 * dim5), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5), order_(6) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; dims_[3] = dim3; dims_[4] = dim4; dims_[5] = dim5; - order_ = 6; - length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, 
dim5); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5); } +// Overloaded 7D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, size_t dim3, - size_t dim4, size_t dim5, - size_t dim6, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4 * dim5 * dim6), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6), order_(7) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; @@ -344,441 +437,192 @@ MPIArrayKokkos::MPIArrayKokkos(size_t dim0, siz dims_[4] = dim4; dims_[5] = dim5; dims_[6] = dim6; - order_ = 7; - length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, dim6, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); } -template -void MPIArrayKokkos::set_mpi_type() { - if (typeid(T).name() == typeid(bool).name()) { - mpi_datatype_ = MPI_C_BOOL; - } - else if (typeid(T).name() == typeid(int).name()) { - mpi_datatype_ = MPI_INT; - } - else if (typeid(T).name() == typeid(long int).name()) { - mpi_datatype_ = MPI_LONG; - } - else if (typeid(T).name() == typeid(long long int).name()) { - mpi_datatype_ = MPI_LONG_LONG_INT; - } - else if (typeid(T).name() == typeid(float).name()) { - mpi_datatype_ = MPI_FLOAT; - } - else if (typeid(T).name() == typeid(double).name()) { - mpi_datatype_ = MPI_DOUBLE; - } - else { - printf("Your entered MPIArrayKokkos type is not a supported type for MPI communications and is being set to 
int\n"); - mpi_datatype_ = MPI_INT; - } -} template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i) const { - assert(order_ == 1 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 1D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 1D!"); - return this_array_.d_view(i); +T& MPICArrayKokkos::operator()(size_t i) const { + assert(order_ == 1 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 1D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 1D!"); + return this_array_(i); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j) const { - assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 2D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 2D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 2D!"); - return this_array_.d_view(j + (i * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j) const { + assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 2D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 2D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 2D!"); + return this_array_(i, j); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k) const { - assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 3D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 3D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 3D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 3D!"); - return this_array_.d_view(k + (j * dims_[2]) - + (i * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k) const { + assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 3D!"); + 
assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 3D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 3D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 3D!"); + return this_array_(i, j, k); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { - assert(order_ == 4 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 4D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 4D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 4D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 4D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 4D!"); - return this_array_.d_view(l + (k * dims_[3]) - + (j * dims_[3] * dims_[2]) - + (i * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { + assert(order_ == 4 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 4D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 4D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 4D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 4D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 4D!"); + return this_array_(i, j, k, l); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, - size_t m) const { - assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 5D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 5D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 5D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 5D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 5D!"); - assert(m >= 0 && m < dims_[4] && "m is out of bounds in 
MPIArrayKokkos 5D!"); - return this_array_.d_view(m + (l * dims_[4]) - + (k * dims_[4] * dims_[3]) - + (j * dims_[4] * dims_[3] * dims_[2]) - + (i * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const { + assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 5D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 5D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 5D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 5D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 5D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 5D!"); + return this_array_(i, j, k, l, m); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, - size_t m, size_t n) const { - assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 6D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 6D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 6D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 6D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 6D!"); - assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 6D!"); - assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 6D!"); - return this_array_.d_view(n + (m * dims_[5]) - + (l * dims_[5] * dims_[4]) - + (k * dims_[5] * dims_[4] * dims_[3]) - + (j * dims_[5] * dims_[4] * dims_[3] * dims_[2]) - + (i * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) const { + assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 6D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 6D!"); + 
assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 6D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 6D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 6D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 6D!"); + assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 6D!"); + return this_array_(i, j, k, l, m, n); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, - size_t m, size_t n, size_t o) const { - assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 7D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 7D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 7D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 7D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 7D!"); - assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 7D!"); - assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 7D!"); - assert(o >= 0 && o < dims_[6] && "o is out of bounds in MPIArrayKokkos 7D!"); - return this_array_.d_view(o + (n * dims_[6]) - + (m * dims_[6] * dims_[5]) - + (l * dims_[6] * dims_[5] * dims_[4]) - + (k * dims_[6] * dims_[5] * dims_[4] * dims_[3]) - + (j * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2]) - + (i * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n, size_t o) const { + assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 7D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 7D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 7D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 7D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 7D!"); + 
assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 7D!"); + assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 7D!"); + assert(o < dims_[6] && "o is out of bounds in MPICArrayKokkos 7D!"); + return this_array_(i, j, k, l, m, n, o); } template KOKKOS_INLINE_FUNCTION -MPIArrayKokkos& MPIArrayKokkos::operator= (const MPIArrayKokkos& temp) { +MPICArrayKokkos& MPICArrayKokkos::operator=(const MPICArrayKokkos& temp) { // Do nothing if the assignment is of the form x = x if (this != &temp) { + + this_array_ = temp.this_array_; + send_buffer_ = temp.send_buffer_; + recv_buffer_ = temp.recv_buffer_; + + length_ = temp.length_; + for (int iter = 0; iter < temp.order_; iter++){ dims_[iter] = temp.dims_[iter]; } // end for order_ = temp.order_; - length_ = temp.length_; - this_array_ = temp.this_array_; - host = temp.host; - mpi_recv_rank_ = temp.mpi_recv_rank_; - mpi_tag_ = temp.mpi_tag_; - mpi_comm_ = temp.mpi_comm_; + mpi_status_ = temp.mpi_status_; mpi_datatype_ = temp.mpi_datatype_; mpi_request_ = temp.mpi_request_; + comm_plan_ = temp.comm_plan_; + + send_counts_ = temp.send_counts_; + recv_counts_ = temp.recv_counts_; + send_displs_ = temp.send_displs_; + recv_displs_ = temp.recv_displs_; + stride_ = temp.stride_; + + send_indices_ = temp.send_indices_; + recv_indices_ = temp.recv_indices_; + + num_owned_ = temp.num_owned_; + num_ghost_ = temp.num_ghost_; + + host = temp.host; // Also copy the host ViewCArray } - return *this; } // Return size template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::size() const { - return length_; +size_t MPICArrayKokkos::size() const { + return this_array_.size(); } template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::extent() const { - return length_; +size_t MPICArrayKokkos::extent() const { + return this_array_.extent(); } template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::dims(size_t i) const { - assert(i < order_ && "MPIArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); - 
assert(i >= 0 && dims_[i]>0 && "Access to MPIArrayKokkos dims is out of bounds!"); - return dims_[i]; +size_t MPICArrayKokkos::dims(size_t i) const { + assert(i < order_ && "MPICArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); + assert(dims_[i] > 0 && "Access to MPICArrayKokkos dims is out of bounds!"); + return this_array_.dims(i); } template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::order() const { - return order_; +size_t MPICArrayKokkos::order() const { + return this_array_.order(); } template KOKKOS_INLINE_FUNCTION -T* MPIArrayKokkos::device_pointer() const { - return this_array_.d_view.data(); +T* MPICArrayKokkos::device_pointer() const { + return this_array_.device_pointer(); } template KOKKOS_INLINE_FUNCTION -T* MPIArrayKokkos::host_pointer() const { - return this_array_.h_view.data(); +T* MPICArrayKokkos::host_pointer() const { + return this_array_.host_pointer(); } template KOKKOS_INLINE_FUNCTION -Kokkos::DualView MPIArrayKokkos::get_kokkos_dual_view() const { - return this_array_; +Kokkos::DualView MPICArrayKokkos::get_kokkos_dual_view() const { + return this_array_.get_kokkos_dual_view(); } template -void MPIArrayKokkos::update_host() { - - this_array_.template modify(); - this_array_.template sync(); -} - -template -void MPIArrayKokkos::update_device() { - - this_array_.template modify(); - this_array_.template sync(); -} - -// a default setup, should not be used except for testing -template -void MPIArrayKokkos::mpi_setup() { - mpi_recv_rank_ = 1; - mpi_tag_ = 99; - mpi_comm_ = MPI_COMM_WORLD; -} - -template -void MPIArrayKokkos::mpi_setup(int recv_rank) { - mpi_recv_rank_ = recv_rank; -} - -template -void MPIArrayKokkos::mpi_setup(int recv_rank, int tag) { - mpi_recv_rank_ = recv_rank; - mpi_tag_ = tag; -} - -template -void MPIArrayKokkos::mpi_setup(int recv_rank, int tag, MPI_Comm comm) { - mpi_recv_rank_ = recv_rank; - mpi_tag_ = tag; - mpi_comm_ = comm; -} - -template -void MPIArrayKokkos::mpi_set_rank(int 
recv_rank) { - mpi_recv_rank_ = recv_rank; +void MPICArrayKokkos::update_host() { + this_array_.update_host(); } template -void MPIArrayKokkos::mpi_set_tag(int tag) { - mpi_tag_ = tag; +void MPICArrayKokkos::update_device() { + this_array_.update_device(); } template -void MPIArrayKokkos::mpi_set_comm(MPI_Comm comm) { - mpi_comm_ = comm; -} - -template -int MPIArrayKokkos::get_rank() { - return mpi_recv_rank_; -} - -template -int MPIArrayKokkos::get_tag() { - return mpi_tag_; -} - -template -MPI_Comm MPIArrayKokkos::get_comm() { - return mpi_comm_; -} - -//MPI_Send wrapper -template -void MPIArrayKokkos::send(size_t count, int dest, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Send(device_pointer(), count, mpi_datatype_, dest, tag, comm); -#else - update_host(); - MPI_Send(host_pointer(), count, mpi_datatype_, dest, tag, comm); -#endif -} - -//MPI_Recv wrapper -template -void MPIArrayKokkos::recv(size_t count, int source, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Recv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); -#else - MPI_Recv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); - update_device(); -#endif -} - -//MPI_Send halo wrapper -template -void MPIArrayKokkos::halo_send() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Send(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); -#else - update_host(); - MPI_Send(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); -#endif -} - -//MPI_Recv halo wrapper -template -void MPIArrayKokkos::halo_recv() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Recv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); -#else - MPI_Recv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); - update_device(); -#endif -} - -//MPI_iSend halo wrapper -template -void MPIArrayKokkos::halo_isend() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Isend(device_pointer(), 
size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#else - update_host(); - MPI_Isend(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#endif -} - -//MPI_iRecv halo wrapper -template -void MPIArrayKokkos::halo_irecv() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Irecv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#else - MPI_Irecv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#endif -} - -//MPI_Bcast wrapper -template -void MPIArrayKokkos::broadcast(size_t count, int root, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Bcast(device_pointer(), count, mpi_datatype_, root, comm); -#else - update_host(); - MPI_Bcast(host_pointer(), count, mpi_datatype_, root, comm); - update_device(); -#endif -} - -//MPI_Scatter wrapper -template -void MPIArrayKokkos::scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Scatter(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); -#else - update_host(); - MPI_Scatter(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); - recv_buffer.update_device(); -#endif -} - -//MPI_Gather wrapper -template -void MPIArrayKokkos::gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Gather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); -#else - update_host(); - MPI_Gather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); - recv_buffer.update_device(); -#endif -} - -//MPI_AllGather wrapper -template -void MPIArrayKokkos::allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t 
recv_count, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Allgather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, comm); -#else - update_host(); - MPI_Allgather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, comm); - recv_buffer.update_device(); -#endif -} - -//MPI_Isend wrapper -template -void MPIArrayKokkos::isend(size_t count, int dest, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Isend(device_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); -#else - update_host(); - MPI_Isend(host_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); -#endif -} - -//MPI_Irecv wrapper -template -void MPIArrayKokkos::irecv(size_t count, int source, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Irecv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); -#else - MPI_Irecv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); -#endif -} - -//MPI_Wait wrapper for the sender -template -void MPIArrayKokkos::wait_send() { - MPI_Wait(&mpi_request_, &mpi_status_); -} +KOKKOS_INLINE_FUNCTION +MPICArrayKokkos::~MPICArrayKokkos() { -//MPI_Wait wrapper for the receiver -template -void MPIArrayKokkos::wait_recv() { - MPI_Wait(&mpi_request_, &mpi_status_); -#ifndef HAVE_GPU_AWARE_MPI - update_device(); -#endif } -//MPI_Barrier wrapper -//template -//void MPIArrayKokkos::barrier(MPI_Comm comm) { -// MPI_Barrier(comm); -//} +} // end namespace mtr -template -KOKKOS_INLINE_FUNCTION -MPIArrayKokkos::~MPIArrayKokkos() {} - -//////////////////////////////////////////////////////////////////////////////// -// End of MPIArrayKokkos -//////////////////////////////////////////////////////////////////////////////// - -} // end namespace #endif // end if have MPI - -#endif // MPI_TYPES_H - +#endif // end if MPICARRAYKOKKOS_H \ No newline at end of file diff --git a/src/include/mpi_types_old.h 
b/src/include/mpi_types_old.h new file mode 100644 index 00000000..b10a57fc --- /dev/null +++ b/src/include/mpi_types_old.h @@ -0,0 +1,784 @@ +#ifndef MPI_TYPES_H +#define MPI_TYPES_H +/********************************************************************************************** + © 2020. Triad National Security, LLC. All rights reserved. + This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos + National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. + Department of Energy/National Nuclear Security Administration. All rights in the program are + reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear + Security Administration. The Government is granted for itself and others acting on its behalf a + nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare + derivative works, distribute copies to the public, perform publicly and display publicly, and + to permit others to do so. + This program is open source under the BSD-3 License. + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + + 3. Neither the name of the copyright holder nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior + written permission. 
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + **********************************************************************************************/ + +#include "host_types.h" +#include "kokkos_types.h" +#include +#ifdef HAVE_MPI +#include + +namespace mtr +{ + +///////////////////////// +// MPIArrayKokkos: Dual type for managing distributed data on both CPU and GPU. 
+///////////////////////// +template +class MPIArrayKokkos { + + // this is manage + using TArray1D = Kokkos::DualView ; + +protected: + size_t dims_[7]; + size_t length_; + size_t order_; // tensor order (rank) + int mpi_recv_rank_; + int mpi_tag_; + MPI_Comm mpi_comm_; + MPI_Status mpi_status_; + MPI_Datatype mpi_datatype_; + MPI_Request mpi_request_; + TArray1D this_array_; + + void set_mpi_type(); + +public: + // Data member to access host view + ViewCArray host; + + MPIArrayKokkos(); + + MPIArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, + size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); + + // These functions can setup the data needed for halo send/receives + // Not necessary for standard MPI comms + void mpi_setup(); + + void mpi_setup(int recv_rank); + + void mpi_setup(int recv_rank, int tag); + + void mpi_setup(int recv_rank, int tag, MPI_Comm comm); + + void mpi_set_rank(int recv_rank); + + void mpi_set_tag(int tag); + + void mpi_set_comm(MPI_Comm comm); + + int get_rank(); + + int get_tag(); + + MPI_Comm get_comm(); + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, 
size_t j, size_t k) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n, size_t o) const; + + KOKKOS_INLINE_FUNCTION + MPIArrayKokkos& operator=(const MPIArrayKokkos& temp); + + // GPU Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t size() const; + + // Host Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t extent() const; + + KOKKOS_INLINE_FUNCTION + size_t dims(size_t i) const; + + KOKKOS_INLINE_FUNCTION + size_t order() const; + + // Method returns the raw device pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* device_pointer() const; + + // Method returns the raw host pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* host_pointer() const; + + // Method returns kokkos dual view + KOKKOS_INLINE_FUNCTION + TArray1D get_kokkos_dual_view() const; + + // Method that update host view + void update_host(); + + // Method that update device view + void update_device(); + + // MPI send wrapper + void send(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI recieve wrapper + void recv(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI broadcast wrapper + void broadcast(size_t count, int root, MPI_Comm comm); + + // MPI scatter wrapper + void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); + + // MPI gather wrapper + void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); + + // MPI allgather wrapper + void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); + + // MPI send wrapper + void 
isend(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI recieve wrapper + void irecv(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI wait wrapper for sender + void wait_send(); + + // MPI wait wrapper for receiver + void wait_recv(); + + // MPI barrier wrapper + //void barrier(MPI_Comm comm); + + // MPI send wrapper + void halo_send(); + + // MPI recieve wrapper + void halo_recv(); + + // MPI send wrapper + void halo_isend(); + + // MPI recieve wrapper + void halo_irecv(); + + // Deconstructor + virtual KOKKOS_INLINE_FUNCTION + ~MPIArrayKokkos (); +}; // End of MPIArrayKokkos + + +// Default constructor +template +MPIArrayKokkos::MPIArrayKokkos() { + length_ = order_ = 0; + for (int i = 0; i < 7; i++) { + dims_[i] = 0; + } +} + +// Overloaded 1D constructor +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, const std::string& tag_string) { + + dims_[0] = dim0; + order_ = 1; + length_ = dim0; + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0); + set_mpi_type(); +} + +// Overloaded 2D constructor +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + order_ = 2; + length_ = (dim0 * dim1); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + order_ = 3; + length_ = (dim0 * dim1 * dim2); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, const std::string& tag_string) { + + dims_[0] = dim0; + 
dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + order_ = 4; + length_ = (dim0 * dim1 * dim2 * dim3); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, + size_t dim4, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + order_ = 5; + length_ = (dim0 * dim1 * dim2 * dim3 * dim4); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, + size_t dim4, size_t dim5, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + dims_[5] = dim5; + order_ = 6; + length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, dim5); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, + size_t dim4, size_t dim5, + size_t dim6, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + dims_[5] = dim5; + dims_[6] = dim6; + order_ = 7; + length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); + set_mpi_type(); +} + +template +void MPIArrayKokkos::set_mpi_type() { + if (typeid(T).name() == typeid(bool).name()) { + mpi_datatype_ = MPI_C_BOOL; + } + else if 
(typeid(T).name() == typeid(int).name()) { + mpi_datatype_ = MPI_INT; + } + else if (typeid(T).name() == typeid(long int).name()) { + mpi_datatype_ = MPI_LONG; + } + else if (typeid(T).name() == typeid(long long int).name()) { + mpi_datatype_ = MPI_LONG_LONG_INT; + } + else if (typeid(T).name() == typeid(float).name()) { + mpi_datatype_ = MPI_FLOAT; + } + else if (typeid(T).name() == typeid(double).name()) { + mpi_datatype_ = MPI_DOUBLE; + } + else { + printf("Your entered MPIArrayKokkos type is not a supported type for MPI communications and is being set to int\n"); + mpi_datatype_ = MPI_INT; + } +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i) const { + assert(order_ == 1 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 1D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 1D!"); + return this_array_.d_view(i); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j) const { + assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 2D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 2D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 2D!"); + return this_array_.d_view(j + (i * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k) const { + assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 3D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 3D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 3D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 3D!"); + return this_array_.d_view(k + (j * dims_[2]) + + (i * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { + assert(order_ == 4 && "Tensor order (rank) does 
not match constructor in MPIArrayKokkos 4D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 4D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 4D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 4D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 4D!"); + return this_array_.d_view(l + (k * dims_[3]) + + (j * dims_[3] * dims_[2]) + + (i * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, + size_t m) const { + assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 5D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 5D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 5D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 5D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 5D!"); + assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 5D!"); + return this_array_.d_view(m + (l * dims_[4]) + + (k * dims_[4] * dims_[3]) + + (j * dims_[4] * dims_[3] * dims_[2]) + + (i * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, + size_t m, size_t n) const { + assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 6D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 6D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 6D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 6D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 6D!"); + assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 6D!"); + assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 
6D!"); + return this_array_.d_view(n + (m * dims_[5]) + + (l * dims_[5] * dims_[4]) + + (k * dims_[5] * dims_[4] * dims_[3]) + + (j * dims_[5] * dims_[4] * dims_[3] * dims_[2]) + + (i * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, + size_t m, size_t n, size_t o) const { + assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 7D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 7D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 7D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 7D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 7D!"); + assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 7D!"); + assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 7D!"); + assert(o >= 0 && o < dims_[6] && "o is out of bounds in MPIArrayKokkos 7D!"); + return this_array_.d_view(o + (n * dims_[6]) + + (m * dims_[6] * dims_[5]) + + (l * dims_[6] * dims_[5] * dims_[4]) + + (k * dims_[6] * dims_[5] * dims_[4] * dims_[3]) + + (j * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2]) + + (i * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +MPIArrayKokkos& MPIArrayKokkos::operator= (const MPIArrayKokkos& temp) { + + // Do nothing if the assignment is of the form x = x + if (this != &temp) { + for (int iter = 0; iter < temp.order_; iter++){ + dims_[iter] = temp.dims_[iter]; + } // end for + + order_ = temp.order_; + length_ = temp.length_; + this_array_ = temp.this_array_; + host = temp.host; + mpi_recv_rank_ = temp.mpi_recv_rank_; + mpi_tag_ = temp.mpi_tag_; + mpi_comm_ = temp.mpi_comm_; + mpi_status_ = temp.mpi_status_; + mpi_datatype_ = temp.mpi_datatype_; + mpi_request_ = temp.mpi_request_; + } + + return *this; +} + +// Return 
size +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::size() const { + return length_; +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::extent() const { + return length_; +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::dims(size_t i) const { + assert(i < order_ && "MPIArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); + assert(i >= 0 && dims_[i]>0 && "Access to MPIArrayKokkos dims is out of bounds!"); + return dims_[i]; +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::order() const { + return order_; +} + +template +KOKKOS_INLINE_FUNCTION +T* MPIArrayKokkos::device_pointer() const { + return this_array_.d_view.data(); +} + +template +KOKKOS_INLINE_FUNCTION +T* MPIArrayKokkos::host_pointer() const { + return this_array_.h_view.data(); +} + +template +KOKKOS_INLINE_FUNCTION +Kokkos::DualView MPIArrayKokkos::get_kokkos_dual_view() const { + return this_array_; +} + +template +void MPIArrayKokkos::update_host() { + + this_array_.template modify(); + this_array_.template sync(); +} + +template +void MPIArrayKokkos::update_device() { + + this_array_.template modify(); + this_array_.template sync(); +} + +// a default setup, should not be used except for testing +template +void MPIArrayKokkos::mpi_setup() { + mpi_recv_rank_ = 1; + mpi_tag_ = 99; + mpi_comm_ = MPI_COMM_WORLD; +} + +template +void MPIArrayKokkos::mpi_setup(int recv_rank) { + mpi_recv_rank_ = recv_rank; +} + +template +void MPIArrayKokkos::mpi_setup(int recv_rank, int tag) { + mpi_recv_rank_ = recv_rank; + mpi_tag_ = tag; +} + +template +void MPIArrayKokkos::mpi_setup(int recv_rank, int tag, MPI_Comm comm) { + mpi_recv_rank_ = recv_rank; + mpi_tag_ = tag; + mpi_comm_ = comm; +} + +template +void MPIArrayKokkos::mpi_set_rank(int recv_rank) { + mpi_recv_rank_ = recv_rank; +} + +template +void MPIArrayKokkos::mpi_set_tag(int tag) { + mpi_tag_ = tag; +} + +template +void MPIArrayKokkos::mpi_set_comm(MPI_Comm comm) { + mpi_comm_ = 
comm; +} + +template +int MPIArrayKokkos::get_rank() { + return mpi_recv_rank_; +} + +template +int MPIArrayKokkos::get_tag() { + return mpi_tag_; +} + +template +MPI_Comm MPIArrayKokkos::get_comm() { + return mpi_comm_; +} + +//MPI_Send wrapper +template +void MPIArrayKokkos::send(size_t count, int dest, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Send(device_pointer(), count, mpi_datatype_, dest, tag, comm); +#else + update_host(); + MPI_Send(host_pointer(), count, mpi_datatype_, dest, tag, comm); +#endif +} + +//MPI_Recv wrapper +template +void MPIArrayKokkos::recv(size_t count, int source, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Recv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); +#else + MPI_Recv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); + update_device(); +#endif +} + +//MPI_Send halo wrapper +template +void MPIArrayKokkos::halo_send() { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Send(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); +#else + update_host(); + MPI_Send(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); +#endif +} + +//MPI_Recv halo wrapper +template +void MPIArrayKokkos::halo_recv() { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Recv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); +#else + MPI_Recv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); + update_device(); +#endif +} + +//MPI_iSend halo wrapper +template +void MPIArrayKokkos::halo_isend() { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Isend(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#else + update_host(); + MPI_Isend(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#endif +} + +//MPI_iRecv halo wrapper +template +void MPIArrayKokkos::halo_irecv() { +#ifdef HAVE_GPU_AWARE_MPI + 
MPI_Irecv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#else + MPI_Irecv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#endif +} + +//MPI_Bcast wrapper +template +void MPIArrayKokkos::broadcast(size_t count, int root, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Bcast(device_pointer(), count, mpi_datatype_, root, comm); +#else + update_host(); + MPI_Bcast(host_pointer(), count, mpi_datatype_, root, comm); + update_device(); +#endif +} + +//MPI_Scatter wrapper +template +void MPIArrayKokkos::scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Scatter(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); +#else + update_host(); + MPI_Scatter(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); + recv_buffer.update_device(); +#endif +} + +//MPI_Gather wrapper +template +void MPIArrayKokkos::gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Gather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); +#else + update_host(); + MPI_Gather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); + recv_buffer.update_device(); +#endif +} + +//MPI_AllGather wrapper +template +void MPIArrayKokkos::allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Allgather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, comm); +#else + update_host(); + MPI_Allgather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, comm); + 
recv_buffer.update_device(); +#endif +} + +//MPI_Isend wrapper +template +void MPIArrayKokkos::isend(size_t count, int dest, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Isend(device_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); +#else + update_host(); + MPI_Isend(host_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); +#endif +} + +//MPI_Irecv wrapper +template +void MPIArrayKokkos::irecv(size_t count, int source, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Irecv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); +#else + MPI_Irecv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); +#endif +} + +//MPI_Wait wrapper for the sender +template +void MPIArrayKokkos::wait_send() { + MPI_Wait(&mpi_request_, &mpi_status_); +} + +//MPI_Wait wrapper for the receiver +template +void MPIArrayKokkos::wait_recv() { + MPI_Wait(&mpi_request_, &mpi_status_); +#ifndef HAVE_GPU_AWARE_MPI + update_device(); +#endif +} + +//MPI_Barrier wrapper +//template +//void MPIArrayKokkos::barrier(MPI_Comm comm) { +// MPI_Barrier(comm); +//} + +template +KOKKOS_INLINE_FUNCTION +MPIArrayKokkos::~MPIArrayKokkos() {} + +//////////////////////////////////////////////////////////////////////////////// +// End of MPIArrayKokkos +//////////////////////////////////////////////////////////////////////////////// + +} // end namespace + +#endif // end if have MPI + +#endif // MPI_TYPES_H + diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8f7fa4c2..e6c2bfaf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) project (matartest) diff --git a/test/test_cases/CMakeLists.txt b/test/test_cases/CMakeLists.txt index 01cc23c0..a0e07edd 100644 --- a/test/test_cases/CMakeLists.txt +++ b/test/test_cases/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) 
+cmake_minimum_required(VERSION 3.5) # Find all test files in the current directory except test_main.cpp file(GLOB TEST_SOURCES "test_*.cpp")