From 2cecf1a1201bf49b5bca931429c1a2dfe3694600 Mon Sep 17 00:00:00 2001
From: Jacob Moore
Date: Wed, 22 Oct 2025 14:02:26 -0500
Subject: [PATCH 01/52] ENH: Adding example for mesh decomposition WIP

---
 examples/mesh_decomp/CMakeLists.txt      |   13 +
 examples/mesh_decomp/install_ptscotch.sh |   31 +
 examples/mesh_decomp/mesh.h              | 1481 +++++++
 examples/mesh_decomp/mesh_decomp.cpp     |   32 +
 examples/mesh_decomp/mesh_io.h           | 4894 ++++++++++++++++++++++
 5 files changed, 6451 insertions(+)
 create mode 100644 examples/mesh_decomp/CMakeLists.txt
 create mode 100755 examples/mesh_decomp/install_ptscotch.sh
 create mode 100644 examples/mesh_decomp/mesh.h
 create mode 100644 examples/mesh_decomp/mesh_decomp.cpp
 create mode 100644 examples/mesh_decomp/mesh_io.h

diff --git a/examples/mesh_decomp/CMakeLists.txt b/examples/mesh_decomp/CMakeLists.txt
new file mode 100644
index 00000000..721859a8
--- /dev/null
+++ b/examples/mesh_decomp/CMakeLists.txt
@@ -0,0 +1,13 @@
+cmake_minimum_required(VERSION 3.1.3)
+
+find_package(Matar REQUIRED)
+
+if (KOKKOS)
+  #find_package(Kokkos REQUIRED) #new
+
+  add_executable(mesh_decomp mesh_decomp.cpp)
+
+  add_definitions(-DHAVE_KOKKOS=1)
+
+  target_link_libraries(mesh_decomp ${LINKING_LIBRARIES})
+endif()
diff --git a/examples/mesh_decomp/install_ptscotch.sh b/examples/mesh_decomp/install_ptscotch.sh
new file mode 100755
index 00000000..95ad7914
--- /dev/null
+++ b/examples/mesh_decomp/install_ptscotch.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Install script for Scotch and PT-Scotch
+set -e
+
+# Configuration
+LIB_DIR="lib"
+# SCOTCH_VERSION="7.0.4"
+# PTSCOTCH_VERSION="7.0.4"
+# INSTALL_PREFIX="$(pwd)/${LIB_DIR}"
+
+# echo "Installing Scotch and PT-Scotch to ${INSTALL_PREFIX}"
+
+# Create lib directory
+mkdir -p "${LIB_DIR}"
+cd ${LIB_DIR}
+# Clone and build Scotch
+echo "Cloning Scotch..."
+if [ -d "scotch" ]; then
+    rm -rf scotch
+fi
+git clone https://gitlab.inria.fr/scotch/scotch.git
+cd scotch
+
+echo "Building Scotch..."
+mkdir build
+cd build
+cmake ..
+make + +echo "Installation complete! Libraries installed in: ${INSTALL_PREFIX}" \ No newline at end of file diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h new file mode 100644 index 00000000..599cb77d --- /dev/null +++ b/examples/mesh_decomp/mesh.h @@ -0,0 +1,1481 @@ +/********************************************************************************************** +� 2020. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos +National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. +Department of Energy/National Nuclear Security Administration. All rights in the program are +reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear +Security Administration. The Government is granted for itself and others acting on its behalf a +nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare +derivative works, distribute copies to the public, perform publicly and display publicly, and +to permit others to do so. +This program is open source under the BSD-3 License. +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or other materials +provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used +to endorse or promote products derived from this software without specific prior +written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************************/ +#ifndef MESH_H +#define MESH_H + +#include "matar.h" +#include "state.h" +#include "ref_elem.h" +#include + +#define PI 3.141592653589793 + +using namespace mtr; + +namespace mesh_init +{ +// element mesh types +enum elem_name_tag +{ + linear_simplex_element = 0, + linear_tensor_element = 1, + arbitrary_tensor_element = 2 +}; + +// other enums could go here on the mesh +} // end namespace + + +/* +========================== +Nodal indexing convention +========================== + + K + ^ J + | / + | / + | / + 6------------------7 + /| /| + / | / | + / | / | + / | / | + / | / | +4------------------5 | +| | | | ----> I +| | | | +| | | | +| | | | +| 2------------|-----3 +| / | / +| / | / +| / | / +| / | / +|/ |/ +0------------------1 + +nodes are ordered for outward normal +patch 0: [0,4,6,2] xi-minus dir +patch 1: [1,3,7,5] xi-plus dir +patch 2: [0,1,5,4] eta-minus dir +patch 3: [3,2,6,7] eta-plus dir +patch 4: [0,2,3,1] zeta-minus dir +patch 6: [4,5,7,6] zeta-plus dir +*/ + +// sort in ascending order using bubble sort +KOKKOS_INLINE_FUNCTION +void bubble_sort(size_t arr[], const size_t num) +{ + for (size_t i = 0; i 
< (num - 1); i++) { + for (size_t j = 0; j < (num - i - 1); j++) { + if (arr[j] > arr[j + 1]) { + size_t temp = arr[j]; + arr[j] = arr[j + 1]; + arr[j + 1] = temp; + } // end if + } // end for j + } // end for i +} // end function + +struct zones_in_elem_t +{ + private: + size_t num_zones_in_elem_; + public: + zones_in_elem_t() { + }; + + zones_in_elem_t(const size_t num_zones_in_elem_inp) { + this->num_zones_in_elem_ = num_zones_in_elem_inp; + }; + + // return global zone index for given local zone index in an element + size_t host(const size_t elem_gid, const size_t zone_lid) const + { + return elem_gid * num_zones_in_elem_ + zone_lid; + }; + + // Return the global zone ID given an element gloabl ID and a local zone ID + KOKKOS_INLINE_FUNCTION + size_t operator()(const size_t elem_gid, const size_t zone_lid) const + { + return elem_gid * num_zones_in_elem_ + zone_lid; + }; +}; + +// if material points are defined strictly internal to the element. +struct gauss_in_elem_t +{ + private: + size_t num_gauss_in_elem_; + public: + gauss_in_elem_t() { + }; + + gauss_in_elem_t(const size_t num_gauss_in_elem_inp) { + this->num_gauss_in_elem_ = num_gauss_in_elem_inp; + }; + + // return global gauss index for given local gauss index in an element + size_t host(const size_t elem_gid, const size_t leg_gauss_lid) const + { + return elem_gid * num_gauss_in_elem_ + leg_gauss_lid; + }; + + // Return the global gauss ID given an element gloabl ID and a local gauss ID + KOKKOS_INLINE_FUNCTION + size_t operator()(const size_t elem_gid, const size_t leg_gauss_lid) const + { + return elem_gid * num_gauss_in_elem_ + leg_gauss_lid; + }; +}; + +/// if material points are defined at element interfaces +struct lobatto_in_elem_t +{ + private: + size_t num_lobatto_in_elem_; + public: + lobatto_in_elem_t() { + }; + + lobatto_in_elem_t(const size_t num_lobatto_in_elem_inp) { + this->num_lobatto_in_elem_ = num_lobatto_in_elem_inp; + }; + + // return global gauss index for given local gauss index 
in an element + size_t host(const size_t elem_gid, const size_t lob_gauss_lid) const + { + return elem_gid * num_lobatto_in_elem_ + lob_gauss_lid; + }; + + // Return the global gauss ID given an element gloabl ID and a local gauss ID + KOKKOS_INLINE_FUNCTION + size_t operator()(const size_t elem_gid, const size_t lob_gauss_lid) const + { + return elem_gid * num_lobatto_in_elem_ + lob_gauss_lid; + }; +}; + +// struct nodes_in_zone_t { +// private: +// size_t num_nodes_in_zone_; +// public: +// nodes_in_zone_t(){}; + +// nodes_in_zone_t(const size_t num_nodes_in_zone_inp){ +// this->num_nodes_in_zone_ = num_nodes_in_zone_inp; +// }; + +// // return global zone index for given local zone index in an element +// size_t host(const size_t zone_gid, const size_t node_lid) const{ +// return zone_gid*num_nodes_in_zone_ + node_lid; +// }; + +// KOKKOS_INLINE_FUNCTION +// size_t operator()(const size_t zone_gid, const size_t node_lid) const{ +// return zone_gid*num_nodes_in_zone_ + node_lid; +// }; +// }; + +// mesh sizes and connectivity data structures +struct Mesh_t +{ + // ******* Entity Definitions **********// + // Element: A hexahedral volume + // Zone: A discretization of an element base on subdividing the element using the nodes + // Node: A kinematic degree of freedom + // Surface: The 2D surface of the element + // Patch: A discretization of a surface by subdividing the surface using the nodes + // Corner: A element-node pair + + // ---- Global Mesh Definitions ---- // + mesh_init::elem_name_tag elem_kind = mesh_init::linear_tensor_element; ///< The type of elements used in the mesh + + size_t Pn = 1; ///< Polynomial order of kinematic space + size_t num_dims = 3; ///< Number of spatial dimension + + // ---- Element Data Definitions ---- // + size_t num_elems; ///< Number of elements in the mesh + size_t num_nodes_in_elem; ///< Number of nodes in an element + size_t num_patches_in_elem; ///< Number of patches in an element + size_t num_surfs_in_elem; ///< Number of 
surfaces in an element + size_t num_zones_in_elem; ///< Number of zones in an element + + size_t num_gauss_in_elem; ///< Number of Gauss points in an element + size_t num_lobatto_in_elem; ///< Number of Gauss Lobatto points in an element + + DCArrayKokkos nodes_in_elem; ///< Nodes in an element + CArrayKokkos corners_in_elem; ///< Corners in an element -- this can just be a functor + + RaggedRightArrayKokkos elems_in_elem; ///< Elements connected to an element + CArrayKokkos num_elems_in_elem; ///< Number of elements connected to an element + + CArrayKokkos patches_in_elem; ///< Patches in an element (including internal patches) + CArrayKokkos surfs_in_elem; ///< Surfaces on an element + + // CArrayKokkos zones_in_elem; ///< Zones in an element + zones_in_elem_t zones_in_elem; ///< Zones in an element + lobatto_in_elem_t lobatto_in_elem; ///< Gauss Lobatto points in an element + gauss_in_elem_t gauss_in_elem; ///< Gauss points in an element + + // ---- Node Data Definitions ---- // + size_t num_nodes; ///< Number of nodes in the mesh + + RaggedRightArrayKokkos corners_in_node; ///< Corners connected to a node + CArrayKokkos num_corners_in_node; ///< Number of corners connected to a node + RaggedRightArrayKokkos elems_in_node; ///< Elements connected to a given node + RaggedRightArrayKokkos nodes_in_node; ///< Nodes connected to a node along an edge + CArrayKokkos num_nodes_in_node; ///< Number of nodes connected to a node along an edge + + // ---- Surface Data Definitions ---- // + size_t num_surfs; ///< Number of surfaces in the mesh + size_t num_nodes_in_surf; ///< Number of nodes in a surface + size_t num_patches_in_surf; ///< Number of patches in a surface + + CArrayKokkos patches_in_surf; ///< Patches in a surface + CArrayKokkos nodes_in_surf; ///< Nodes connected to a surface + CArrayKokkos elems_in_surf; ///< Elements connected to a surface + + // ---- Patch Data Definitions ---- // + size_t num_patches; ///< Number of patches in the mesh + size_t 
num_nodes_in_patch; ///< Number of nodes in a patch + // size_t num_lobatto_in_patch; ///< Number of Gauss Lobatto nodes in a patch + // size_t num_gauss_in_patch; ///< Number of Gauss nodes in a patch + + CArrayKokkos nodes_in_patch; ///< Nodes connected to a patch + CArrayKokkos elems_in_patch; ///< Elements connected to a patch + CArrayKokkos surf_in_patch; ///< Surfaces connected to a patch (co-planar) + + // ---- Corner Data Definitions ---- // + size_t num_corners; ///< Number of corners (define) in the mesh + + // ---- Zone Data Definitions ---- // + size_t num_zones; ///< Number of zones in the mesh + size_t num_nodes_in_zone; ///< Number of nodes in a zone + + CArrayKokkos nodes_in_zone; ///< Nodes defining a zone + // nodes_in_zone_t nodes_in_zone; + + // ---- Boundary Data Definitions ---- // + size_t num_bdy_sets; ///< Number of boundary sets + size_t num_bdy_nodes; ///< Number of boundary nodes + size_t num_bdy_patches; ///< Number of boundary patches + + CArrayKokkos bdy_patches; ///< Boundary patches + CArrayKokkos bdy_nodes; ///< Boundary nodes + + RaggedRightArrayKokkos bdy_patches_in_set; ///< Boundary patches in a boundary set + DCArrayKokkos num_bdy_patches_in_set; ///< Number of boundary nodes in a set + + RaggedRightArrayKokkos bdy_nodes_in_set; ///< Boundary nodes in a boundary set + DCArrayKokkos num_bdy_nodes_in_set; ///< Number of boundary nodes in a set + + // initialization methods + void initialize_nodes(const size_t num_nodes_inp) + { + num_nodes = num_nodes_inp; + + return; + }; // end method + + // initialization methods + void initialize_elems(const size_t num_elems_inp, const size_t num_dims_inp) + { + num_dims = num_dims_inp; + num_nodes_in_elem = 1; + + for (int dim = 0; dim < num_dims; dim++) { + num_nodes_in_elem *= 2; + } + num_elems = num_elems_inp; + nodes_in_elem = DCArrayKokkos(num_elems, num_nodes_in_elem, "mesh.nodes_in_elem"); + corners_in_elem = CArrayKokkos(num_elems, num_nodes_in_elem, "mesh.corners_in_elem"); + + // 
1 Gauss point per element + num_gauss_in_elem = 1; + + // 1 zone per element + num_zones_in_elem = 1; + + gauss_in_elem = gauss_in_elem_t(num_gauss_in_elem); + + return; + }; // end method + + // initialization method + void initialize_elems_Pn(const size_t num_elems_inp, + const size_t num_nodes_in_elem_inp, + const size_t num_gauss_leg_in_elem_inp, + const size_t num_zones_in_elem_inp, + const size_t num_nodes_in_zone_inp, + const size_t num_surfs_in_elem_inp, + const size_t num_dims_inp) + { + num_dims = num_dims_inp; + num_elems = num_elems_inp; + + num_nodes_in_elem = num_nodes_in_elem_inp; + num_nodes_in_zone = num_nodes_in_zone_inp; + num_gauss_in_elem = num_gauss_leg_in_elem_inp; + num_zones_in_elem = num_zones_in_elem_inp; + num_surfs_in_elem = num_surfs_in_elem_inp; + + num_zones = num_zones_in_elem * num_elems; + + nodes_in_elem = DCArrayKokkos(num_elems, num_nodes_in_elem, "mesh.nodes_in_elem"); + corners_in_elem = CArrayKokkos(num_elems, num_nodes_in_elem, "mesh.corners_in_elem"); + zones_in_elem = zones_in_elem_t(num_zones_in_elem); + surfs_in_elem = CArrayKokkos(num_elems, num_surfs_in_elem, "mesh.surfs_in_zone"); + nodes_in_zone = CArrayKokkos(num_zones, num_nodes_in_zone, "mesh.nodes_in_zone"); + gauss_in_elem = gauss_in_elem_t(num_gauss_in_elem); + + return; + }; // end method + + // initialization methods + void initialize_corners(const size_t num_corners_inp) + { + num_corners = num_corners_inp; + + return; + }; // end method + + // build the corner mesh connectivity arrays + void build_corner_connectivity() + { + num_corners_in_node = CArrayKokkos(num_nodes, "mesh.num_corners_in_node"); // stride sizes + + // initializing the number of corners (node-cell pair) to be zero + FOR_ALL_CLASS(node_gid, 0, num_nodes, { + num_corners_in_node(node_gid) = 0; + }); + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(node_lid, 0, num_nodes_in_elem, { + // get the global_id of the node + size_t node_gid = 
nodes_in_elem(elem_gid, node_lid); + + // increment the number of corners attached to this point + num_corners_in_node(node_gid) = num_corners_in_node(node_gid) + 1; + }); // end FOR_ALL over nodes in element + } // end for elem_gid + + // the stride sizes are the num_corners_in_node at the node + corners_in_node = RaggedRightArrayKokkos(num_corners_in_node, "mesh.corners_in_node"); + + CArrayKokkos count_saved_corners_in_node(num_nodes, "count_saved_corners_in_node"); + + // reset num_corners to zero + FOR_ALL_CLASS(node_gid, 0, num_nodes, { + count_saved_corners_in_node(node_gid) = 0; + }); + + // the elems_in_elem data type + elems_in_node = RaggedRightArrayKokkos(num_corners_in_node, "mesh.elems_in_node"); + + // populate the elements connected to a node list and corners in a node + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(node_lid, 0, num_nodes_in_elem, { + // get the global_id of the node + size_t node_gid = nodes_in_elem(elem_gid, node_lid); + + // the column index is the num corners saved + size_t j = count_saved_corners_in_node(node_gid); + + // Save corner index to this node_gid + size_t corner_gid = node_lid + elem_gid * num_nodes_in_elem; // this can be a functor + corners_in_node(node_gid, j) = corner_gid; + + elems_in_node(node_gid, j) = elem_gid; // save the elem_gid + + // Save corner index to element + size_t corner_lid = node_lid; + corners_in_elem(elem_gid, corner_lid) = corner_gid; + + // increment the number of corners saved to this node_gid + count_saved_corners_in_node(node_gid) = count_saved_corners_in_node(node_gid) + 1; + }); // end FOR_ALL over nodes in element + } // end for elem_gid + + return; + } // end of build_corner_connectivity + + // build elem connectivity arrays + void build_elem_elem_connectivity() + { + // find the max number of elems around a node + size_t max_num_elems_in_node; + size_t max_num_lcl; + FOR_REDUCE_MAX_CLASS(node_gid, 0, num_nodes, max_num_lcl, { + // num_corners_in_node = 
num_elems_in_node + size_t max_num = num_corners_in_node(node_gid); + + if (max_num > max_num_lcl) { + max_num_lcl = max_num; + } + }, max_num_elems_in_node); // end parallel reduction on max + Kokkos::fence(); + + // a temporary ragged array to save the elems around an elem + DynamicRaggedRightArrayKokkos temp_elems_in_elem(num_nodes, num_nodes_in_elem * max_num_elems_in_node, "temp_elems_in_elem"); + + num_elems_in_elem = CArrayKokkos(num_elems, "mesh.num_elems_in_elem"); + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + num_elems_in_elem(elem_gid) = 0; + }); + Kokkos::fence(); + + // find and save neighboring elem_gids of an elem + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (int node_lid = 0; node_lid < num_nodes_in_elem; node_lid++) { + // get the gid for the node + size_t node_id = nodes_in_elem(elem_gid, node_lid); + + // loop over all elems connected to node_gid + for (int elem_lid = 0; elem_lid < num_corners_in_node(node_id); elem_lid++) { + // get the global id for the neighboring elem + size_t neighbor_elem_gid = elems_in_node(node_id, elem_lid); + + // a flag to save (=1) or not (=0) + size_t save = 1; + + // a true neighbor_elem_id is not equal to elem_gid + if (neighbor_elem_gid == elem_gid) { + save = 0; // don't save + } // end if + + // check to see if the neighbor_elem_gid has been saved already + size_t num_saved = temp_elems_in_elem.stride(elem_gid); + for (size_t i = 0; i < num_saved; i++) { + if (neighbor_elem_gid == temp_elems_in_elem(elem_gid, i)) { + save = 0; // don't save, it has been saved already + } // end if + } // end for i + + if (save == 1) { + // increment the number of neighboring elements saved + temp_elems_in_elem.stride(elem_gid)++; + + // save the neighboring elem_gid + temp_elems_in_elem(elem_gid, num_saved) = neighbor_elem_gid; + } // end if save + } // end for elem_lid in a node + } // end for node_lid in an elem + + // save the actial stride size + num_elems_in_elem(elem_gid) = temp_elems_in_elem.stride(elem_gid); + }); // 
end FOR_ALL elems + Kokkos::fence(); + + // compress out the extra space in the temp_elems_in_elem + elems_in_elem = RaggedRightArrayKokkos(num_elems_in_elem, "mesh.elems_in_elem"); + + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (size_t i = 0; i < num_elems_in_elem(elem_gid); i++) { + elems_in_elem(elem_gid, i) = temp_elems_in_elem(elem_gid, i); + } // end for i + }); // end FOR_ALL elems + Kokkos::fence(); + + return; + } // end of build_elem_elem_connectivity + + // build the patches + void build_patch_connectivity() + { + // WARNING WARNING + // the mesh element kind should be in the input file and set when reading mesh + // mesh_elem_kind = mesh_init::linear_tensor_element; // MUST BE SET + + // building patches + + num_nodes_in_patch = 2 * (num_dims - 1); // 2 (2D) or 4 (3D) + num_surfs_in_elem = 2 * num_dims; // 4 (2D) or 6 (3D) + + // num_lobatto_in_patch = int(pow(3, num_dims-1)); + + // num_gauss_in_patch = 2*(num_dims-1); + + size_t num_patches_in_surf; // = Pn_order or = Pn_order*Pn_order + + size_t num_1D = Pn + 1; // number of nodes in 1D + + // num quad points 1D // + // size_t num_lob_1D = 2*Pn + 1; + // size_t num_1D = 2*Pn; + + DCArrayKokkos node_ordering_in_elem; // dimensions will be (num_patches_in_elem, num_nodes_in_patch); + + // DCArrayKokkos lobatto_ordering_in_elem; // dimensions will be (num_patches_in_elem, num_lobatto_in_patch); + + // DCArrayKokkos gauss_ordering_in_elem; // dimensions will be (num_patches_in_elem, num_gauss_in_patch); + + printf("Number of dimensions = %zu \n", num_dims); + + if (num_dims == 3) { + // num_patches_in_surf = [1^2, 2^2, 3^2, 4^2, ... 
, Pn^2] + + num_patches_in_surf = Pn * Pn; + + num_patches_in_elem = num_patches_in_surf * num_surfs_in_elem; + + // nodes in a patch in the element + node_ordering_in_elem = DCArrayKokkos(num_patches_in_elem, num_nodes_in_patch, "node_ordering_in_elem"); + + // lobatto_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_lobatto_in_patch); + + // gauss_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_gauss_in_patch); + + // printf("num_patches_in_elem = %zu \n", num_patches_in_elem); + // printf("num_nodes_in_patch = %zu \n", num_nodes_in_patch); + // printf("num_lobatto_in_patch = %zu \n", num_lobatto_in_patch); + // printf("num_gauss_in_patch = %zu \n", num_gauss_in_patch); + // printf("Number of surfaces = %zu \n", num_surfs_in_elem); + } + else { + num_patches_in_surf = Pn; + + num_patches_in_elem = num_patches_in_surf * num_surfs_in_elem; + + // nodes in a patch in the element + node_ordering_in_elem = DCArrayKokkos(num_patches_in_elem, num_nodes_in_patch, "node_ordering_in_elem"); + // lobatto_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_lobatto_in_patch); + // gauss_ordering_in_elem = DCArrayKokkos (num_patches_in_elem, num_gauss_in_patch); + } // end if dim + + // On the CPU, set the node order for the patches in an element + // classic linear elements + if (elem_kind == mesh_init::linear_tensor_element) { + if (num_dims == 3) { + + size_t temp_node_lids[24] = { 0, 4, 6, 2, + 1, 3, 7, 5, + 0, 1, 5, 4, + 3, 2, 6, 7, + 0, 2, 3, 1, + 4, 5, 7, 6 }; + + int count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + // gauss_ordering_in_elem.host( elem_patch_lid, node_lid ) = temp_node_lids[count]; + count++; + } // end for node_lid + elem_patch_lid++; + } 
// end for patch_lid in a surface + } // end for i + + // count = 0; + // elem_patch_lid = 0; + // for ( size_t surf_lid=0; surf_lid < num_surfs_in_elem; surf_lid++ ){ + // for ( size_t patch_lid=0; patch_lid < num_patches_in_surf; patch_lid++ ){ + // for ( size_t lobatto_lid=0; lobatto_lid < num_lobatto_in_patch; lobatto_lid++ ){ + // lobatto_ordering_in_elem.host( elem_patch_lid, lobatto_lid ) = temp_node_lids[count]; + // count++; + // } // end for node_lid + // elem_patch_lid ++; + // } // end for patch_lid in a surface + // } // end for i + } + else { + // J + // | + // 3---2 + // | | -- I + // 0---1 + // + size_t temp_node_lids[8] = + { 0, 3, + 1, 2, + 0, 1, + 3, 2 }; + + int count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + // gauss_ordering_in_elem.host( elem_patch_lid, node_lid ) = temp_node_lids[count]; + count++; + } // end for node_lid + elem_patch_lid++; + } // end for patch_lid in a surface + } // end for i + } // end if on dims + } // end of linear element iwth classic numbering + // ----- + // arbitrary-order element + // ----- + else if (elem_kind == mesh_init::arbitrary_tensor_element) { + size_t temp_node_lids[num_nodes_in_patch * num_patches_in_surf * num_surfs_in_elem]; + + printf("arbitrary order tensor element \n"); + + // arbitrary-order node ordering in patches of an element + if (num_dims == 3) { + /* + + i,j,k layout + + k j + | / + |/ + o-->i + + + i=0,imax + o (j+1,k+1) + /| + (j,k+1) o o (j+1,k) + |/ + (j,k) o + + */ + + int count = 0; + + int i_patch, j_patch, k_patch; + + // i-minus-dir patches + + i_patch = 0; + for (int k = 0; k < num_1D - 1; k++) { + for (int j = 0; j < num_1D - 1; j++) { + // node_lid 0 in patch + // index = i + j*num_1D + 
k*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j, k, num_1D); + count++; + + // node_lid 1 in patch + // index = i + j*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j, k+1, num_1D); + count++; + + // node_lid 2 in patch + // index = i + (j+1)*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j+1, k+1, num_1D); + count++; + + // node_lid 3 in patch + // index = i + (j+1)*num_1D + k*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j+1, k, num_1D); + count++; + } // end for k + } // end for j + + // printf("i-minus\n"); + + // i-plus-dir patches + i_patch = num_1D - 1; + // printf("num_1D = %zu \n", num_1D); + // printf("i_patch = %d \n", i_patch); + printf("num_nodes_in_elem %zu \n", num_nodes_in_elem); + for (int k = 0; k < num_1D - 1; k++) { + for (int j = 0; j < num_1D - 1; j++) { + // node_lid 0 in patch + // index = i + j*num_1D + k*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j, k, num_1D); + count++; + + // node_lid 1 in patch + // index = i + (j+1)*num_1D + k*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + k * num_1D * num_1D; // node_rid(i_patch, j+1, k, num_1D); + count++; + + // node_lid 2 in patch + // index = i + (j+1)*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + (j + 1) * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j+1, k+1, num_1D); + count++; + + // node_lid 3 in patch + // index = i + j*num_1D + (k+1)*num_1D*num_1D; + temp_node_lids[count] = i_patch + j * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i_patch, j, k+1, num_1D); + count++; + } // end for j + } // end for k + + // printf("i-plus\n"); + + /* + + i,j,k layout + + k j + | / + |/ + o-->i + + 
+ j=0,jmax + + (i,,k+1) o--o (i+1,,k+1) + | | + (i,,k) o--o (i+1,,k) + + */ + + j_patch = 0; + for (int k = 0; k < num_1D - 1; k++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i, j_patch, k, num_1D); + count++; + + // node_lid 1 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i+1, j_patch, k, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i+1, j_patch, k+1, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i, j_patch, k+1, num_1D); + count++; + } // end for i + } // end for k + + // printf("j-minus\n"); + + j_patch = num_1D - 1; + for (int k = 0; k < num_1D - 1; k++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i, j_patch, k, num_1D); + count++; + + // node_lid 1 in patch + temp_node_lids[count] = i + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i, j_patch, k+1, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + (k + 1) * num_1D * num_1D; // node_rid(i+1, j_patch, k+1, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + 1 + j_patch * num_1D + k * num_1D * num_1D; // node_rid(i+1, j_patch, k, num_1D); + count++; + } // end for i + } // end for k + + // printf("j-plus\n"); + + /* + + i,j,k layout + + k j + | / + |/ + o-->i + + + k=0,kmax + + (i,j+1) o--o (i+1,j+1) + / / + (i,j) o--o (i+1,j) + + */ + + k_patch = 0; + for (int j = 0; j < num_1D - 1; j++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j, k_patch, num_1D); + count++; + + // node_lid 1 in 
patch + temp_node_lids[count] = i + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j+1, k_patch, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j+1, k_patch, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + 1 + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j, k_patch, num_1D); + count++; + } // end for i + } // end for j + // printf("k-minus\n"); + + k_patch = num_1D - 1; + for (int j = 0; j < num_1D - 1; j++) { + for (int i = 0; i < num_1D - 1; i++) { + // node_lid 0 in patch + temp_node_lids[count] = i + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j, k_patch, num_1D); + count++; + + // node_lid 1 in patch + temp_node_lids[count] = i + 1 + j * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j, k_patch, num_1D); + count++; + + // node_lid 2 in patch + temp_node_lids[count] = i + 1 + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i+1, j+1, k_patch, num_1D); + count++; + + // node_lid 3 in patch + temp_node_lids[count] = i + (j + 1) * num_1D + k_patch * num_1D * num_1D; // node_rid(i, j+1, k_patch, num_1D); + count++; + } // end for i + } // end for j + + // printf("k-plus\n"); + + count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < 6; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < 4; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + count++; + } // end for node_lid + elem_patch_lid++; + } // end for patch_lid in a surface + } // end for i + } // end if 3D + // + else{ + // 2D arbitrary order elements + int count = 0; + int i_patch, j_patch; + + // i-minus-dir patches + + i_patch = 0; + for (int j = 0; j < num_1D - 1; j++) { + temp_node_lids[count] = i_patch + j * num_1D; // node_rid(i_patch, j, num_1D; + count++; + + temp_node_lids[count] = 
i_patch + (j + 1) * num_1D; // node_rid(i_patch, j+1, num_1D; + count++; + } // end for j + + // i-plus-dir patches + i_patch = num_1D - 1; + for (int j = 0; j < num_1D - 1; j++) { + temp_node_lids[count] = i_patch + j * num_1D; // node_rid(i_patch, j, num_1D; + count++; + + temp_node_lids[count] = i_patch + (j + 1) * num_1D; // node_rid(i_patch, j+1, num_1D; + count++; + } // end for j + + j_patch = 0; + for (int i = 0; i < num_1D - 1; i++) { + temp_node_lids[count] = i + j_patch * num_1D; // node_rid(i, j_patch, num_1D); + count++; + + temp_node_lids[count] = i + 1 + j_patch * num_1D; // node_rid(i+1, j_patch, num_1D); + count++; + } // end for i + + j_patch = num_1D - 1; + for (int i = 0; i < num_1D - 1; i++) { + temp_node_lids[count] = i + j_patch * num_1D; // node_rid(i, j_patch, num_1D); + count++; + + temp_node_lids[count] = i + 1 + j_patch * num_1D; // node_rid(i+1, j_patch, num_1D); + count++; + } // end for i + + count = 0; + int elem_patch_lid = 0; + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + node_ordering_in_elem.host(elem_patch_lid, node_lid) = temp_node_lids[count]; + count++; + } // end for node_lid + elem_patch_lid++; + } // end for patch_lid in a surface + } // end for i + } // end else on dim + + // build zones in high order element + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + size_t node_lids[8]; // temp storage for local node ids + for (int k = 0; k < num_1D - 1; k++) { + for (int j = 0; j < num_1D - 1; j++) { + for (int i = 0; i < num_1D - 1; i++) { + node_lids[0] = i + j * (num_1D) + k * (num_1D) * (num_1D); // i,j,k + node_lids[1] = i + 1 + j * (num_1D) + k * (num_1D) * (num_1D); // i+1, j, k + node_lids[2] = i + (j + 1) * (num_1D) + k * (num_1D) * (num_1D); // i,j+1,k + node_lids[3] = i + 1 + (j + 1) * (num_1D) + k * (num_1D) * (num_1D); // i+1, j+1, k + node_lids[4] 
= i + j * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i, j , k+1 + node_lids[5] = i + 1 + j * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i + 1, j , k+1 + node_lids[6] = i + (j + 1) * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i,j+1,k+1 + node_lids[7] = i + 1 + (j + 1) * (num_1D) + (k + 1) * (num_1D) * (num_1D); // i+1, j+1, k+1 + + size_t zone_lid = i + j * (num_1D - 1) + k * (num_1D - 1) * (num_1D - 1); + size_t zone_gid = zones_in_elem(elem_gid, zone_lid); + + for (int node_lid = 0; node_lid < 8; node_lid++) { + // get global id for the node + size_t node_gid = nodes_in_elem(elem_gid, node_lids[node_lid]); + nodes_in_zone(zone_gid, node_lid) = node_gid; + } + } // i + } // j + } // k + }); // end FOR_ALL elem_gid + } // end if arbitrary-order element + else { + printf("\nERROR: mesh type is not known \n"); + } // end if + + // update the device + node_ordering_in_elem.update_device(); + Kokkos::fence(); + + printf("Built node ordering \n"); + + // for saving the hash keys of the patches and then the neighboring elem_gid + CArrayKokkos hash_keys_in_elem(num_elems, num_patches_in_elem, num_nodes_in_patch, "hash_keys_in_elem"); // always 4 ids in 3D + + // for saving the adjacent patch_lid, which is the slide_lid + // CArrayKokkos neighboring_side_lids (num_elems, num_patches_in_elem); + + // allocate memory for the patches in the elem + patches_in_elem = CArrayKokkos(num_elems, num_patches_in_elem, "mesh.patches_in_elem"); + + // a temporary storage for the patch_gids that are on the mesh boundary + CArrayKokkos temp_bdy_patches(num_elems * num_patches_in_elem, "temp_bdy_patches"); + + // step 1) calculate the hash values for each patch in the element + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (size_t patch_lid = 0; patch_lid < num_patches_in_elem; patch_lid++) { + size_t sorted_patch_nodes[4]; // note: cannot be allocated with num_nodes_in_patch + + // first save the patch nodes + for (size_t patch_node_lid = 0; patch_node_lid < num_nodes_in_patch; 
patch_node_lid++) { + // get the local node index of the element for this patch and node in patch + size_t node_lid = node_ordering_in_elem(patch_lid, patch_node_lid); + + // get and save the global index of the node + sorted_patch_nodes[patch_node_lid] = nodes_in_elem(elem_gid, node_lid); + } // end for node_lid + + // sort nodes from smallest to largest + bubble_sort(sorted_patch_nodes, num_nodes_in_patch); + + // save hash_keys in the this elem + for (size_t key_lid = 0; key_lid < num_nodes_in_patch; key_lid++) { + hash_keys_in_elem(elem_gid, patch_lid, key_lid) = sorted_patch_nodes[key_lid]; // 4 node values are keys + } // for + } // end for patch_lid + }); // end FOR_ALL elem_gid + + DCArrayKokkos num_values(2, "num_values"); + + // 8x8x8 mesh + // num_patches = 8*8*9*3 = 1728 + // bdy_patches = 8*8*6 = 384 + // + + // step 2: walk around the elements and save the elem pairs that have the same hash_key + RUN_CLASS({ + // serial execution on GPU + + size_t patch_gid = 0; + size_t bdy_patch_gid = 0; + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + // loop over the patches in this elem + for (size_t patch_lid = 0; patch_lid < num_patches_in_elem; patch_lid++) { + size_t exit = 0; + + // negative values mean the patch has not been saved + if (hash_keys_in_elem(elem_gid, patch_lid, 0) >= 0) { + // find the nighboring patch with the same hash_key + + for (size_t neighbor_elem_lid = 0; neighbor_elem_lid < num_elems_in_elem(elem_gid); neighbor_elem_lid++) { + // get the neighboring element global index + size_t neighbor_elem_gid = elems_in_elem(elem_gid, neighbor_elem_lid); + + for (size_t neighbor_patch_lid = 0; neighbor_patch_lid < num_patches_in_elem; neighbor_patch_lid++) { + size_t save_it = 0; + for (size_t key_lid = 0; key_lid < num_nodes_in_patch; key_lid++) { + if (hash_keys_in_elem(neighbor_elem_gid, neighbor_patch_lid, key_lid) == hash_keys_in_elem(elem_gid, patch_lid, key_lid)) { + save_it++; // if save_it == num_nodes after this loop, 
then it is a match + } + } // end key loop + + // this hash is from the nodes on the patch + if (save_it == num_nodes_in_patch) { + // make it negative, because we saved it + hash_keys_in_elem(elem_gid, patch_lid, 0) = -1; + hash_keys_in_elem(neighbor_elem_gid, neighbor_patch_lid, 0) = -1; + + // save the patch_lids for the adjacent sides + // neighboring_side_lids(elem_gid, patch_lid) = neighbor_patch_lid; + // neighboring_side_lids(neighbor_elem_gid, neighbor_patch_lid) = patch_lid; + + // save the patch_gid + patches_in_elem(elem_gid, patch_lid) = patch_gid; + patches_in_elem(neighbor_elem_gid, neighbor_patch_lid) = patch_gid; + + patch_gid++; + + exit = 1; + break; + } // end if + } // end for loop over a neighbors patch set + + if (exit == 1) { + break; + } + } // end for loop over elem neighbors + } // end if hash<0 + } // end for patch_lid + + // loop over the patches in this element again + // remaining positive hash key values are the boundary patches + for (size_t patch_lid = 0; patch_lid < num_patches_in_elem; patch_lid++) { + if (hash_keys_in_elem(elem_gid, patch_lid, 0) >= 0) { + hash_keys_in_elem(elem_gid, patch_lid, 0) = -1; // make it negative, because we saved it + + // neighboring_side_lids(elem_gid, patch_lid) = patch_lid; + + patches_in_elem(elem_gid, patch_lid) = patch_gid; + temp_bdy_patches(bdy_patch_gid) = patch_gid; + + patch_gid++; + bdy_patch_gid++; + } // end if + } // end for over patch_lid + } // end for over elem_gid + + // the num_values is because the values passed in are const, so a const pointer is needed + num_values(0) = patch_gid; // num_patches = patch_gid; + num_values(1) = bdy_patch_gid; // num_bdy_patches = bdy_patch_gid; + }); // end RUN + Kokkos::fence(); + + num_values.update_host(); + Kokkos::fence(); + + num_patches = num_values.host(0); + // this lines assumes num_surfs == num_patches, only valid for 1st order elements + num_surfs = num_values.host(0); + num_bdy_patches = num_values.host(1); + + // size_t mesh_1D = 
60; + // size_t exact_num_patches = (mesh_1D*mesh_1D)*(mesh_1D+1)*3; + // size_t exact_num_bdy_patches = (mesh_1D*mesh_1D)*6; + // printf("num_patches = %lu, exact = %lu \n", num_patches, exact_num_patches); + // printf("num_bdy_patches = %lu exact = %lu \n", num_bdy_patches, exact_num_bdy_patches); + + // printf("Num patches = %lu \n", num_patches); + // printf("Num boundary patches = %lu \n", num_bdy_patches); + + elems_in_patch = CArrayKokkos(num_patches, 2, "mesh.elems_in_patch"); + nodes_in_patch = CArrayKokkos(num_patches, num_nodes_in_patch, "mesh.nodes_in_patch"); + + // a temporary variable to help populate patch structures + CArrayKokkos num_elems_in_patch_saved(num_patches, "num_elems_in_patch_saved"); + + // initialize the number of elems in a patch saved to zero + FOR_ALL_CLASS(patch_gid, 0, num_patches, { + num_elems_in_patch_saved(patch_gid) = 0; + }); + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(patch_lid, 0, num_patches_in_elem, { + size_t patch_gid = patches_in_elem(elem_gid, patch_lid); + + size_t num_saved = num_elems_in_patch_saved(patch_gid); + + elems_in_patch(patch_gid, num_saved) = elem_gid; + + // record that an elem_gid was saved + num_elems_in_patch_saved(patch_gid)++; + + // save the nodes on this patch + for (size_t patch_node_lid = 0; patch_node_lid < num_nodes_in_patch; patch_node_lid++) { + // get the local node index of the element for this patch and node in patch + size_t node_lid = node_ordering_in_elem(patch_lid, patch_node_lid); + + // get and save the global index of the node + nodes_in_patch(patch_gid, patch_node_lid) = nodes_in_elem(elem_gid, node_lid); + } // end for node_lid + }); // end FOR_ALL patch_lid + } // end for + + // Surfaces and patches in surface + if (elem_kind == mesh_init::arbitrary_tensor_element) { + // allocate memory for the surfaces in the elem + surfs_in_elem = CArrayKokkos(num_elems, num_surfs_in_elem); + + // allocate memory for surface data structures + num_surfs 
= num_patches / num_patches_in_surf; + + patches_in_surf = CArrayKokkos(num_surfs, num_patches_in_surf, "mesh.patches_in_surf"); + elems_in_surf = CArrayKokkos(num_surfs, 2, "mesh.elems_in_surf"); + surf_in_patch = CArrayKokkos(num_patches, "mesh.surf_in_patch"); + + FOR_ALL_CLASS(surf_gid, 0, num_surfs, { + // loop over the patches in this surface + for (size_t patch_lid = 0; patch_lid < num_patches_in_surf; patch_lid++) { + // get patch_gid + size_t patch_gid = patch_lid + surf_gid * num_patches_in_surf; + + // save the patch_gids + patches_in_surf(surf_gid, patch_lid) = patch_gid; + + // save the surface this patch belongs to + surf_in_patch(patch_gid) = surf_gid; + } // end for + + // get first patch in the surface, and populate elem surface structures + size_t this_patch_gid = surf_gid * num_patches_in_surf; + + elems_in_surf(surf_gid, 0) = elems_in_patch(this_patch_gid, 0); // elem_gid0 + elems_in_surf(surf_gid, 1) = elems_in_patch(this_patch_gid, 1); // elem_gid1 + }); // end FOR_ALL over surfaces + + // save surfaces in elem + FOR_ALL_CLASS(elem_gid, 0, num_elems, { + for (size_t surf_lid = 0; surf_lid < num_surfs_in_elem; surf_lid++) { + // get the local patch_lid + size_t patch_lid = surf_lid * num_patches_in_surf; + + // get the patch_gids in this element + size_t patch_gid = patches_in_elem(elem_gid, patch_lid); + + // save the surface gid + // Grab the first patch on surf and return surface_gid from surf_in_patch // + surfs_in_elem(elem_gid, surf_lid) = surf_in_patch(patch_gid); + } // end surf_lid + }); + + DViewCArrayKokkos surf_node_ordering_in_elem; + + if (num_dims == 3) { + // num_1D = Pn+1 + int num_surface_nodes = num_surfs_in_elem * pow(num_1D, num_dims - 1); + size_t temp_surf_node_lids[num_surface_nodes]; + // 2D arbitrary order elements + int count = 0; + + for (int i_surf = 0; i_surf < 2; i_surf++) { + for (int k = 0; k < num_1D; k++) { + for (int j = 0; j < num_1D; j++) { + // node_lid 0 in patch + // index = i + j*num_1D + 
k*num_1D*num_1D; + temp_surf_node_lids[count] = i_surf + j * num_1D + k * num_1D * num_1D; + count++; + } // end for k + } // end for j + } + + for (int j_surf = 0; j_surf < 2; j_surf++) { + for (int k = 0; k < num_1D; k++) { + for (int i = 0; i < num_1D; i++) { + // node_lid 0 in patch + temp_surf_node_lids[count] = i + j_surf * num_1D + k * num_1D * num_1D; + count++; + } + } + } + + for (int k_surf = 0; k_surf < 2; k_surf++) { + for (int j = 0; j < num_1D; j++) { + for (int i = 0; i < num_1D; i++) { + // node_lid 0 in patch + temp_surf_node_lids[count] = i + j * num_1D + k_surf * num_1D * num_1D; + count++; + } + } + } + + nodes_in_surf = CArrayKokkos(num_surfs, num_1D * num_1D, "mesh.nodes_in_surf"); + + num_nodes_in_surf = num_1D * num_1D; + surf_node_ordering_in_elem = DViewCArrayKokkos(&temp_surf_node_lids[0], num_surfs_in_elem, num_nodes_in_surf); + surf_node_ordering_in_elem.update_device(); + for (int elem_gid = 0; elem_gid < num_elems; elem_gid++) { + FOR_ALL_CLASS(surf_lid, 0, num_surfs_in_elem, { + int surf_gid = surfs_in_elem(elem_gid, surf_lid); + for (int surf_node_lid = 0; surf_node_lid < num_nodes_in_surf; surf_node_lid++) { + int node_lid = surf_node_ordering_in_elem(surf_lid, surf_node_lid); + int node_gid = nodes_in_elem(elem_gid, node_lid); + nodes_in_surf(surf_gid, surf_node_lid) = node_gid; + } // end loop over surf_node_lid + }); // end loop over FOR_ALL_CLASS + } // end loop over elem_gid + } // end 3D scope + } // end of high-order mesh objects + + // ---------------- + + // allocate memory for boundary patches + bdy_patches = CArrayKokkos(num_bdy_patches, "mesh.bdy_patches"); + + FOR_ALL_CLASS(bdy_patch_gid, 0, num_bdy_patches, { + bdy_patches(bdy_patch_gid) = temp_bdy_patches(bdy_patch_gid); + }); // end FOR_ALL bdy_patch_gid + + // find and store the boundary nodes + CArrayKokkos temp_bdy_nodes(num_nodes, "temp_bdy_nodes"); + CArrayKokkos hash_bdy_nodes(num_nodes, "hash_bdy_nodes"); + + FOR_ALL_CLASS(node_gid, 0, num_nodes, { + 
hash_bdy_nodes(node_gid) = -1; + }); // end for node_gid + + // Parallel loop over boundary patches + DCArrayKokkos num_bdy_nodes_saved(1, "num_bdy_nodes_saved"); + + RUN_CLASS({ + num_bdy_nodes_saved(0) = 0; + for (size_t bdy_patch_gid = 0; bdy_patch_gid < num_bdy_patches; bdy_patch_gid++) { + // get the global index of the patch that is on the boundary + size_t patch_gid = bdy_patches(bdy_patch_gid); + + // tag the boundary nodes + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + size_t node_gid = nodes_in_patch(patch_gid, node_lid); + + if (hash_bdy_nodes(node_gid) < 0) { + hash_bdy_nodes(node_gid) = node_gid; + temp_bdy_nodes(num_bdy_nodes_saved(0)) = node_gid; + + // printf("bdy_node = %lu \n", node_gid); + num_bdy_nodes_saved(0)++; + } // end if + } // end for node_lid + } // end for loop over bdy_patch_gid + }); // end RUN + Kokkos::fence(); + + // copy value to host (CPU) + num_bdy_nodes_saved.update_host(); + Kokkos::fence(); + + // save the number of bdy_nodes to Mesh_t + num_bdy_nodes = num_bdy_nodes_saved.host(0); + + bdy_nodes = CArrayKokkos(num_bdy_nodes, "mesh.bdy_nodes"); + + FOR_ALL_CLASS(node_gid, 0, num_bdy_nodes, { + bdy_nodes(node_gid) = temp_bdy_nodes(node_gid); + }); // end for boundary node_gid + + // printf("Num boundary nodes = %lu \n", num_bdy_nodes); + + return; + } // end patch connectivity method + + // build the patches + void build_node_node_connectivity() + { + // find the max number of elems around a node + size_t max_num_elems_in_node; + size_t max_num_lcl; + FOR_REDUCE_MAX_CLASS(node_gid, 0, num_nodes, max_num_lcl, { + // num_corners_in_node = num_elems_in_node + size_t max_num = num_corners_in_node(node_gid); + + if (max_num > max_num_lcl) { + max_num_lcl = max_num; + } + }, max_num_elems_in_node); // end parallel reduction on max + Kokkos::fence(); + + // each elem corner will contribute 3 edges to the node. 
Those edges will likely be the same + // ones from an adjacent element so it is a safe estimate to multiply by 3 + DynamicRaggedRightArrayKokkos temp_nodes_in_nodes(num_nodes, max_num_elems_in_node * 3, "temp_nodes_in_nodes"); + + num_nodes_in_node = CArrayKokkos(num_nodes, "mesh.num_nodes_in_node"); + + // walk over the patches and save the node node connectivity + RUN_CLASS({ + if (num_dims == 3) { + for (size_t patch_gid = 0; patch_gid < num_patches; patch_gid++) { + for (size_t node_lid = 0; node_lid < num_nodes_in_patch; node_lid++) { + // the first node on the edge + size_t node_gid_0 = nodes_in_patch(patch_gid, node_lid); + + // second node on this edge + size_t node_gid_1; + + if (node_lid == num_nodes_in_patch - 1) { + node_gid_1 = nodes_in_patch(patch_gid, 0); + } + else { + node_gid_1 = nodes_in_patch(patch_gid, node_lid + 1); + } // end if + + size_t num_saved_0 = temp_nodes_in_nodes.stride(node_gid_0); + size_t num_saved_1 = temp_nodes_in_nodes.stride(node_gid_1); + + size_t save_0 = 1; + size_t save_1 = 1; + + // check to see if the node_gid_1 was already saved + for (size_t contents_lid = 0; contents_lid < num_saved_0; contents_lid++) { + if (temp_nodes_in_nodes(node_gid_0, contents_lid) == node_gid_1) { + save_0 = 0; // don't save, it was already saved + } + } + + // check to see if the node_gid_0 was already saved + for (size_t contents_lid = 0; contents_lid < num_saved_1; contents_lid++) { + if (temp_nodes_in_nodes(node_gid_1, contents_lid) == node_gid_0) { + save_1 = 0; // don't save, it was already saved + } + } + + if (save_0 == 1) { + // increment the number of nodes in a node saved + temp_nodes_in_nodes.stride(node_gid_0)++; + + // save the second node to the first node + temp_nodes_in_nodes(node_gid_0, num_saved_0) = node_gid_1; + } + + if (save_1 == 1) { + // increment the number of nodes in a node saved + temp_nodes_in_nodes.stride(node_gid_1)++; + + // save the first node to the second node + temp_nodes_in_nodes(node_gid_1, num_saved_1) = 
node_gid_0; + } + + // save the strides + num_nodes_in_node(node_gid_0) = temp_nodes_in_nodes.stride(node_gid_0); + num_nodes_in_node(node_gid_1) = temp_nodes_in_nodes.stride(node_gid_1); + } // end for node in patch + } // end for patches + } // end if 3D + else { + for (size_t patch_gid = 0; patch_gid < num_patches; patch_gid++) { + // the first node on the edge + size_t node_gid_0 = nodes_in_patch(patch_gid, 0); + + // second node on this edge + size_t node_gid_1 = nodes_in_patch(patch_gid, 1); + + size_t num_saved_0 = temp_nodes_in_nodes.stride(node_gid_0); + size_t num_saved_1 = temp_nodes_in_nodes.stride(node_gid_1); + + // increment the number of nodes in a node saved + temp_nodes_in_nodes.stride(node_gid_0)++; + temp_nodes_in_nodes.stride(node_gid_1)++; + + // save the second node to the first node + temp_nodes_in_nodes(node_gid_0, num_saved_0) = node_gid_1; + + // save the first node to the second node + temp_nodes_in_nodes(node_gid_1, num_saved_1) = node_gid_0; + + // save the strides + num_nodes_in_node(node_gid_0) = temp_nodes_in_nodes.stride(node_gid_0); + num_nodes_in_node(node_gid_1) = temp_nodes_in_nodes.stride(node_gid_1); + } // end for patches + } // end if 2D + }); // end RUN + Kokkos::fence(); + + nodes_in_node = RaggedRightArrayKokkos(num_nodes_in_node, "mesh.nodes_in_node"); + + // save the connectivity + FOR_ALL_CLASS(node_gid, 0, num_nodes, { + size_t num_saved = 0; + for (size_t node_lid = 0; node_lid < num_nodes_in_node(node_gid); node_lid++) { + nodes_in_node(node_gid, num_saved) = temp_nodes_in_nodes(node_gid, num_saved); + + // increment the number of nodes in node saved + num_saved++; + } // end for node_lid + }); // end parallel for over nodes + } // end of node node connectivity + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn build_connectivity + /// + /// \brief Calls multiple build connectivity function + /// + 
///////////////////////////////////////////////////////////////////////////// + void build_connectivity() + { + build_corner_connectivity(); + printf("Built corner connectivity \n"); + + build_elem_elem_connectivity(); + printf("Built element-element connectivity \n"); + + build_patch_connectivity(); + printf("Built patch connectivity \n"); + + build_node_node_connectivity(); + printf("Built node-node connectivity \n"); + } + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn init_bdy_sets + /// + /// \brief Initialize memory for boundary sets + /// + ///////////////////////////////////////////////////////////////////////////// + void init_bdy_sets(size_t num_bcs) + { + // if (num_bcs == 0) { + // printf("ERROR: number of boundary sets = 0, set it = 1"); + // num_bcs = 1; + // } + num_bdy_sets = num_bcs; + num_bdy_patches_in_set = DCArrayKokkos(num_bcs, "mesh.num_bdy_patches_in_set"); + + // bdy_patches_in_set is a raggedRight array, it is allocated + // in tag_bdys fcn after the sparsity is known, see geometry_new.cpp + + return; + } // end of init_bdy_sets method + + +}; // end Mesh_t + +#endif \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp new file mode 100644 index 00000000..dd26b631 --- /dev/null +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -0,0 +1,32 @@ +#include +#include +#include +#include +#include +#include + +// Include Scotch headers +#include "scotch.h" +#include "ptscotch.h" + + +struct initial_mesh_t { + int num_elems; // Number of elements + + std::vector nodes_in_elem; // Nodes in an element + std::vector elems_in_elem; // Elements in an element + + std::vector verttab; // Start index in edgetab for each element (size num_elems+1) + std::vector edgetab; // Adjacency info: neighboring element indices +}; + + +int main(int argc, char** argv) { + + initial_mesh_t initial_mesh; + + + + + return 0; +} \ No newline at end of file diff 
--git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h new file mode 100644 index 00000000..03fee676 --- /dev/null +++ b/examples/mesh_decomp/mesh_io.h @@ -0,0 +1,4894 @@ +/********************************************************************************************** +© 2020. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos +National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. +Department of Energy/National Nuclear Security Administration. All rights in the program are +reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear +Security Administration. The Government is granted for itself and others acting on its behalf a +nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare +derivative works, distribute copies to the public, perform publicly and display publicly, and +to permit others to do so. +This program is open source under the BSD-3 License. +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or other materials +provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used +to endorse or promote products derived from this software without specific prior +written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************************/ +#ifndef FIERRO_IO_H +#define FIERRO_IO_H + +#include "matar.h" +#include "mesh.h" +#include "state.h" +#include "simulation_parameters.h" +#include "region.h" +#include "string_utils.h" + +#include +#include +#include +#include +#include +#include // for string pattern recoginition +#include +#include +#include +#include + + + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn get_id +/// +/// \brief This gives the index value of the point or the elem +/// +/// Assumes that the grid has an i,j,k structure +/// the elem = i + (j)*(num_points_i-1) + (k)*(num_points_i-1)*(num_points_j-1) +/// the point = i + (j)*num_points_i + (k)*num_points_i*num_points_j +/// +/// \param i index +/// \param j index +/// \param k index +/// \param Number of i indices +/// \param Number of j indices +/// +///////////////////////////////////////////////////////////////////////////// +inline int get_id(int i, int j, int k, int num_i, int num_j) +{ + return i + j * num_i + k * num_i * num_j; +} + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn 
PointIndexFromIJK +/// +/// \brief Given (i,j,k) coordinates within the Lagrange hex, return an +/// offset into the local connectivity (PointIds) array. The order parameter +/// must point to an array of 3 integers specifying the order along each +/// axis of the hexahedron. +/// +///////////////////////////////////////////////////////////////////////////// +inline int PointIndexFromIJK(int i, int j, int k, const int* order) +{ + bool ibdy = (i == 0 || i == order[0]); + bool jbdy = (j == 0 || j == order[1]); + bool kbdy = (k == 0 || k == order[2]); + // How many boundaries do we lie on at once? + int nbdy = (ibdy ? 1 : 0) + (jbdy ? 1 : 0) + (kbdy ? 1 : 0); + + if (nbdy == 3) { // Vertex DOF + // ijk is a corner node. Return the proper index (somewhere in [0,7]): + return (i ? (j ? 2 : 1) : (j ? 3 : 0)) + (k ? 4 : 0); + } + + int offset = 8; + if (nbdy == 2) { // Edge DOF + if (!ibdy) { // On i axis + return (i - 1) + (j ? order[0] - 1 + order[1] - 1 : 0) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; + } + if (!jbdy) { // On j axis + return (j - 1) + (i ? order[0] - 1 : 2 * (order[0] - 1) + order[1] - 1) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; + } + // !kbdy, On k axis + offset += 4 * (order[0] - 1) + 4 * (order[1] - 1); + return (k - 1) + (order[2] - 1) * (i ? (j ? 3 : 1) : (j ? 2 : 0)) + offset; + } + + offset += 4 * (order[0] - 1 + order[1] - 1 + order[2] - 1); + if (nbdy == 1) { // Face DOF + if (ibdy) { // On i-normal face + return (j - 1) + ((order[1] - 1) * (k - 1)) + (i ? (order[1] - 1) * (order[2] - 1) : 0) + offset; + } + offset += 2 * (order[1] - 1) * (order[2] - 1); + if (jbdy) { // On j-normal face + return (i - 1) + ((order[0] - 1) * (k - 1)) + (j ? (order[2] - 1) * (order[0] - 1) : 0) + offset; + } + offset += 2 * (order[2] - 1) * (order[0] - 1); + // kbdy, On k-normal face + return (i - 1) + ((order[0] - 1) * (j - 1)) + (k ? 
(order[0] - 1) * (order[1] - 1) : 0) + offset; + } + + // nbdy == 0: Body DOF + offset += 2 * ( (order[1] - 1) * (order[2] - 1) + (order[2] - 1) * (order[0] - 1) + (order[0] - 1) * (order[1] - 1)); + return offset + (i - 1) + (order[0] - 1) * ( (j - 1) + (order[1] - 1) * ( (k - 1))); +} + +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn get_id_device +/// +/// \brief This gives the index value of the point or the elem +/// +/// Assumes that the grid has an i,j,k structure +/// the elem = i + (j)*(num_points_i-1) + (k)*(num_points_i-1)*(num_points_j-1) +/// the point = i + (j)*num_points_i + (k)*num_points_i*num_points_j +/// +/// \param i index +/// \param j index +/// \param k index +/// \param Number of i indices +/// \param Number of j indices +/// +///////////////////////////////////////////////////////////////////////////// +KOKKOS_INLINE_FUNCTION +int get_id_device(int i, int j, int k, int num_i, int num_j) +{ + return i + j * num_i + k * num_i * num_j; +} + + +//------- +// word is the field name e.g., Offsets, connectivity, etc. 
+// stop is the phrase to stop extracting values +template +inline bool extract_values_xml(T *values_xml, + const std::string& word, + const std::string& stop, + std::ifstream& in, + size_t& size) +{ + + bool found = false; + + std::string line; + + size_t i = 0; + + // Read the file line by line looking for specified word + while (std::getline(in, line)) { + + if (line.find(word) != std::string::npos) { // Check if the portion of the word is in the line + found = true; + } + if(found) { + + // loop over the lines in the file, extracting the values of the field corresponding to the word + while (std::getline(in, line)){ + + std::istringstream iss(line); // Create a stream from the line + + // extract the individual values from the stream + T value; + while (iss >> value) { + values_xml[i] = value; + i++; + } // end while + + if (line.find(stop) != std::string::npos) { // Check if the stop word is in the line + break; + } // end if + + } // end while + + if(found) break; + + } // end if found + + } // end while + + size = i; + + return found; + +} // end function + + +// find the number of points and number of cells in the mesh +inline bool extract_num_points_and_cells_xml(int& numberOfPoints, + int& numberOfCells, + std::ifstream& in) +{ + bool found = false; + + std::string line; + + + // Read the file line by line looking for NumberOfPoints + while (std::getline(in, line)) { + + std::string word = "NumberOfPoints="; // A portion of a word + + if (line.find(word) != std::string::npos) { // Check if the portion of the word is in the line + found = true; + } + if(found) { + // Define regex pattern to match the attributes and capture values + std::regex pattern(R"(NumberOfPoints=\"(\d+)\" NumberOfCells=\"(\d+)\")"); + std::smatch match; + + if (std::regex_search(line, match, pattern)) { + //std::cout << "Number of nodes in mesh file: " << match[1] << std::endl; + //std::cout << "Number of cells in mesh file: " << match[2] << std::endl; + + numberOfPoints = 
std::stoi(match[1].str()); + numberOfCells = std::stoi(match[2].str()); + + } else { + std::cout << "Error reading the number of points and cells in the mesh!" << std::endl; + } + + break; + } // end if + + } // end while + + return found; + +} // end function + + +// 8 = pixal i,j,k linear quad ording +// 9 = linear quad ensight ordering +// 11 = voxel i,j,k linear hex ording +// 12 = linear ensight hex ordering +// 72 = VTK_LAGRANGE_HEXAHEDRON +namespace element_types +{ + enum element_name + { + linear_quad_ijk = 8, + linear_quad = 9, + linear_hex_ijk = 11, + linear_hex = 12, + arbitrary_hex = 72 + }; +} + +///////////////////////////////////////////////////////////////////////////// +/// +/// \class MeshReader +/// +/// \brief Class for simplifying reading meshes +/// +/// This class contains the requisite functions required to read different +/// mesh formats. The idea is to set the mesh file name, and parse the +/// extension to decide which reader to use. Currently, only ensight .geo +/// files are supported. 
+/// +///////////////////////////////////////////////////////////////////////////// +class MeshReader +{ +private: + // Handy structs for parsing input meshes + struct Node { + int id; + double x, y, z; + }; + + struct Element { + int id; + std::vector connectivity; + }; + +public: + + char* mesh_file_ = NULL; + + MeshReader() {} // Simulation_Parameters& _simparam); + + ~MeshReader() = default; + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn set_mesh_file + /// + /// \brief Sets the mesh file path for reading in a mesh + /// + /// \param Path to mesh file + /// + ///////////////////////////////////////////////////////////////////////////// + void set_mesh_file(char* MESH) + { + mesh_file_ = MESH; + } + + // Reads and initializes the mesh and geometric state entities + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn read_mesh + /// + /// \brief Read mesh from file + /// + /// \param Simulation mesh + /// \param Simulation state + /// \param Number of dimensions + /// + /// + ///////////////////////////////////////////////////////////////////////////// + void read_mesh(Mesh_t& mesh, + State_t& State, + mesh_input_t& mesh_inps, + int num_dims) + { + if (mesh_file_ == NULL) { + throw std::runtime_error("**** No mesh path given for read_mesh ****"); + } + + std::ifstream file(mesh_file_); + if (file.is_open()) { + std::cout << "The file exists." 
<< std::endl; + file.close(); + } else { + throw std::runtime_error("**** Mesh path given does not exists ****"); + } + + // Check mesh file extension + // and read based on extension + std::string filePathStr(mesh_file_); + std::string extension; + + size_t pos = filePathStr.rfind('.'); + if (pos != std::string::npos) { + extension = filePathStr.substr(pos + 1); + } else { + extension = ""; + } + + std::cout << "File extension is: " << extension << std::endl; + + if(extension == "geo"){ // Ensight meshfile extension + read_ensight_mesh(mesh, State.GaussPoints, State.node, State.corner, mesh_inps, num_dims); + } + else if(extension == "inp"){ // Abaqus meshfile extension + read_Abaqus_mesh(mesh, State, num_dims); + } + else if(extension == "vtk"){ // vtk file format + read_vtk_mesh(mesh, State.GaussPoints, State.node, State.corner, mesh_inps, num_dims); + } + else if(extension == "vtu"){ // vtu file format + read_vtu_mesh(mesh, State.GaussPoints, State.node, State.corner, mesh_inps, num_dims); + } + else{ + throw std::runtime_error("**** Mesh file extension not understood ****"); + } + + } + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn read_ensight_mesh + /// + /// \brief Read .geo mesh file + /// + /// \param Simulation mesh + /// \param Element state struct + /// \param Node state struct + /// \param Corner state struct + /// \param Number of dimensions + /// + ///////////////////////////////////////////////////////////////////////////// + void read_ensight_mesh(Mesh_t& mesh, + GaussPoint_t& GaussPoints, + node_t& node, + corner_t& corner, + mesh_input_t& mesh_inps, + int num_dims) + { + FILE* in; + char ch; + + size_t num_nodes_in_elem = 1; + for (int dim = 0; dim < num_dims; dim++) { + num_nodes_in_elem *= 2; + } + + // read the mesh WARNING: assumes a .geo file + in = fopen(mesh_file_, "r"); + + // skip 8 lines + for (int j = 1; j <= 8; j++) { + int i = 0; + while ((ch = (char)fgetc(in)) != '\n') { + i++; + } 
+ } + + // --- Read in the nodes in the mesh --- + + size_t num_nodes = 0; + + fscanf(in, "%lu", &num_nodes); + printf("Number of nodes read in %lu\n", num_nodes); + + + mesh.initialize_nodes(num_nodes); + + // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dims, required_node_state); + + // read the initial mesh coordinates + // x-coords + for (int node_id = 0; node_id < mesh.num_nodes; node_id++) { + fscanf(in, "%le", &node.coords.host(node_id, 0)); + node.coords.host(node_id, 0)*= mesh_inps.scale_x; + } + + // y-coords + for (int node_id = 0; node_id < mesh.num_nodes; node_id++) { + fscanf(in, "%le", &node.coords.host(node_id, 1)); + node.coords.host(node_id, 1)*= mesh_inps.scale_y; + } + + // z-coords + for (int node_id = 0; node_id < mesh.num_nodes; node_id++) { + if (num_dims == 3) { + fscanf(in, "%le", &node.coords.host(node_id, 2)); + node.coords.host(node_id, 2)*= mesh_inps.scale_z; + } + else{ + double dummy; + fscanf(in, "%le", &dummy); + } + } // end for + + + // Update device nodal positions + node.coords.update_device(); + + ch = (char)fgetc(in); + + // skip 1 line + for (int j = 1; j <= 1; j++) { + int i = 0; + while ((ch = (char)fgetc(in)) != '\n') { + i++; + } + } + + // --- read in the elements in the mesh --- + size_t num_elem = 0; + + fscanf(in, "%lu", &num_elem); + printf("Number of elements read in %lu\n", num_elem); + + // initialize elem variables + mesh.initialize_elems(num_elem, num_dims); + // GaussPoints.initialize(num_elem, 3); // always 3D here, even for 2D + + + // for each cell read the list of associated nodes + for (int elem_gid = 0; elem_gid < num_elem; elem_gid++) { + for (int node_lid = 0; node_lid < num_nodes_in_elem; node_lid++) { + fscanf(in, "%lu", &mesh.nodes_in_elem.host(elem_gid, node_lid)); // %d vs zu + + // shift to start node index space at 0 + 
mesh.nodes_in_elem.host(elem_gid, node_lid) -= 1; + } + } + + // Convert from ensight to IJK mesh + int convert_ensight_to_ijk[8]; + convert_ensight_to_ijk[0] = 0; + convert_ensight_to_ijk[1] = 1; + convert_ensight_to_ijk[2] = 3; + convert_ensight_to_ijk[3] = 2; + convert_ensight_to_ijk[4] = 4; + convert_ensight_to_ijk[5] = 5; + convert_ensight_to_ijk[6] = 7; + convert_ensight_to_ijk[7] = 6; + + int tmp_ijk_indx[8]; + + for (int elem_gid = 0; elem_gid < num_elem; elem_gid++) { + for (int node_lid = 0; node_lid < num_nodes_in_elem; node_lid++) { + tmp_ijk_indx[node_lid] = mesh.nodes_in_elem.host(elem_gid, convert_ensight_to_ijk[node_lid]); + } + + for (int node_lid = 0; node_lid < num_nodes_in_elem; node_lid++){ + mesh.nodes_in_elem.host(elem_gid, node_lid) = tmp_ijk_indx[node_lid]; + } + } + // update device side + mesh.nodes_in_elem.update_device(); + + // initialize corner variables + int num_corners = num_elem * mesh.num_nodes_in_elem; + mesh.initialize_corners(num_corners); + // corner.initialize(num_corners, num_dims); + + // Close mesh input file + fclose(in); + + // Build connectivity + mesh.build_connectivity(); + + return; + } // end read ensight mesh + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn read_Abaqus_mesh + /// + /// \brief Read .inp mesh file + /// + /// \param Simulation mesh + /// \param Simulation state + /// \param Node state struct + /// \param Number of dimensions + /// + ///////////////////////////////////////////////////////////////////////////// + void read_Abaqus_mesh(Mesh_t& mesh, + State_t& State, + int num_dims) + { + + std::cout<<"Reading abaqus input file for mesh"< nodes; + std::vector elements; + + std::string line; + bool readingNodes = false; + bool readingElements = false; + + while (std::getline(inputFile, line)) { + if (line.find("*Node") != std::string::npos) { + readingNodes = true; + std::cout<<"Found *Node"<> node.id && std::getline(iss, token, ',') && iss >> node.x && + 
std::getline(iss, token, ',') && iss >> node.y && + std::getline(iss, token, ',') && iss >> node.z)) { + std::cerr << "Failed to parse line: " << line << std::endl; + continue; // Skip this line if parsing failed + } + nodes.push_back(node); + } + + if (line.find("*Element") != std::string::npos) { + readingElements = true; + std::cout<<"Found *Element*"<> element.id)){ + std::cout << "Failed to parse line: " << line << std::endl; + continue; // Skip this line if parsing failed + } + + while ((std::getline(iss, token, ','))) { + // Now extract the integer, ignoring any trailing whitespace + int val; + iss >> val; + element.connectivity.push_back(val); + } + + // Convert from abaqus to IJK mesh + int convert_abq_to_ijk[8]; + convert_abq_to_ijk[0] = 0; + convert_abq_to_ijk[1] = 1; + convert_abq_to_ijk[2] = 3; + convert_abq_to_ijk[3] = 2; + convert_abq_to_ijk[4] = 4; + convert_abq_to_ijk[5] = 5; + convert_abq_to_ijk[6] = 7; + convert_abq_to_ijk[7] = 6; + + int tmp_ijk_indx[8]; + + for (int node_lid = 0; node_lid < 8; node_lid++) { + tmp_ijk_indx[node_lid] = element.connectivity[convert_abq_to_ijk[node_lid]]; + } + + for (int node_lid = 0; node_lid < 8; node_lid++){ + element.connectivity[node_lid] = tmp_ijk_indx[node_lid]; + } + + elements.push_back(element); + } + } + + inputFile.close(); + + size_t num_nodes = nodes.size(); + + printf("Number of nodes read in %lu\n", num_nodes); + + // initialize node variables + mesh.initialize_nodes(num_nodes); + + // initialize node state, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + + State.node.initialize(num_nodes, num_dims, required_node_state); + + + // Copy nodes to mesh + for(int node_gid = 0; node_gid < num_nodes; node_gid++){ + State.node.coords.host(node_gid, 0) = nodes[node_gid].x; + State.node.coords.host(node_gid, 1) = nodes[node_gid].y; + State.node.coords.host(node_gid, 2) = nodes[node_gid].z; + } + + // Update 
device nodal positions + State.node.coords.update_device(); + + + // --- read in the elements in the mesh --- + size_t num_elem = elements.size(); + printf("Number of elements read in %lu\n", num_elem); + + // initialize elem variables + mesh.initialize_elems(num_elem, num_dims); + + + // for each cell read the list of associated nodes + for (int elem_gid = 0; elem_gid < num_elem; elem_gid++) { + for (int node_lid = 0; node_lid < 8; node_lid++) { + mesh.nodes_in_elem.host(elem_gid, node_lid) = elements[elem_gid].connectivity[node_lid]; + + // shift to start node index space at 0 + mesh.nodes_in_elem.host(elem_gid, node_lid) -= 1; + } + } + + // update device side + mesh.nodes_in_elem.update_device(); + + // initialize corner variables + int num_corners = num_elem * mesh.num_nodes_in_elem; + mesh.initialize_corners(num_corners); + // State.corner.initialize(num_corners, num_dims); + + // Build connectivity + mesh.build_connectivity(); + } // end read abaqus mesh + + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn read_vtk_mesh + /// + /// \brief Read ASCII .vtk mesh file + /// + /// \param Simulation mesh + /// \param Simulation state + /// \param Node state struct + /// \param Number of dimensions + /// + ///////////////////////////////////////////////////////////////////////////// + void read_vtk_mesh(Mesh_t& mesh, + GaussPoint_t& GaussPoints, + node_t& node, + corner_t& corner, + mesh_input_t& mesh_inps, + int num_dims) + { + + std::cout<<"Reading VTK mesh"< v = split (str, delimiter); + + // looking for the following text: + // POINTS %d float + if(v[0] == "POINTS"){ + size_t num_nodes = std::stoi(v[1]); + printf("Number of nodes read in %zu\n", num_nodes); + mesh.initialize_nodes(num_nodes); + + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dims, required_node_state); + + found=true; + } // end if + + + if (i>1000){ + std::cerr << "ERROR: Failed to find POINTS in file" 
<< std::endl; + break; + } // end if + + i++; + } // end while + + // read the node coordinates + for (node_gid=0; node_gid v = split (str, delimiter); + + // save the nodal coordinates + node.coords.host(node_gid, 0) = mesh_inps.scale_x*std::stod(v[0]); // double + node.coords.host(node_gid, 1) = mesh_inps.scale_y*std::stod(v[1]); // double + if(num_dims==3){ + node.coords.host(node_gid, 2) = mesh_inps.scale_z*std::stod(v[2]); // double + } + + } // end for nodes + + + // Update device nodal positions + node.coords.update_device(); + + + found=false; + + // look for CELLS + i = 0; + size_t num_elem = 0; + while (found==false) { + std::string str; + std::getline(in, str); + + std::string delimiter = " "; + std::vector v = split (str, delimiter); + std::cout << v[0] << std::endl; // printing + + // looking for the following text: + // CELLS num_elem size + if(v[0] == "CELLS"){ + num_elem = std::stoi(v[1]); + printf("Number of elements read in %zu\n", num_elem); + + // initialize elem variables + mesh.initialize_elems(num_elem, num_dims); + + found=true; + } // end if + + + if (i>1000){ + printf("ERROR: Failed to find CELLS \n"); + break; + } // end if + + i++; + } // end while + + + // read the node ids in the element + for (elem_gid=0; elem_gid v = split (str, delimiter); + num_nodes_in_elem = std::stoi(v[0]); + + for (size_t node_lid=0; node_lid v = split (str, delimiter); + + // looking for the following text: + // CELLS num_elem size + if(v[0] == "CELL_TYPES"){ + + std::getline(in, str); + elem_type = std::stoi(str); + + found=true; + } // end if + + + if (i>1000){ + printf("ERROR: Failed to find elem_TYPE \n"); + break; + } // end if + + i++; + } // end while + printf("Element type = %zu \n", elem_type); + // elem types: + // linear hex = 12, linear quad = 9 + found=false; + + + if(num_nodes_in_elem==8 & elem_type != 12) { + printf("Wrong element type of %zu \n", elem_type); + std::cerr << "ERROR: incorrect element type in VTK file" << std::endl; + } + + 
in.close(); + + } // end of VTKread function + + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn read_vtu_mesh + /// + /// \brief Read ASCII .vtu mesh file + /// + /// \param Simulation mesh + /// \param Simulation state + /// \param Node state struct + /// \param Number of dimensions + /// + ///////////////////////////////////////////////////////////////////////////// + void read_vtu_mesh(Mesh_t& mesh, + GaussPoint_t& GaussPoints, + node_t& node, + corner_t& corner, + mesh_input_t& mesh_inps, + int num_dims) + { + + std::cout<<"Reading VTU file in a multiblock VTK mesh"< required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dims, required_node_state); + + //------------------------------------ + // allocate the elem object id array + mesh_inps.object_ids = DCArrayKokkos (num_elems, "ObjectIDs"); + + + // ------------------------ + // Mesh file storage order: + // objectId + // Points + // connectivity + // offsets + // types + // ------------------------ + + // temporary arrays + DCArrayKokkos node_coords(num_nodes,3, "node_coords_vtu_file"); // always 3 with vtu files + DCArrayKokkos connectivity(num_elems,num_nodes_in_elem, "connectivity_vtu_file"); + DCArrayKokkos elem_types(num_elems, "elem_types_vtu_file"); // element types + + + // for all fields, we stop recording when we get to "<" + std::string stop = "<"; + + // the size of 1D storage from reading the mesh file + size_t size; + + // --- + // Object ids + // --- + + // the object id in the element + // array dims are (num_elems) + found = extract_values_xml(mesh_inps.object_ids.host.pointer(), + "\"ObjectId\"", + stop, + in, + size); + if(found==false){ + throw std::runtime_error("ERROR: ObjectIDs were not found in the XML file!"); + //std::cout << "ERROR: ObjectIDs were not found in the XML file!" 
<< std::endl; + } + mesh_inps.object_ids.update_device(); + + + // --- + // Nodal coordinates of mesh + // --- + + // coordinates of the node + // array dims are (num_nodes,dims) + // must use the quotes around Points to read the point values + found = extract_values_xml(node_coords.host.pointer(), + "\"Points\"", + stop, + in, + size); + if(found==false){ + throw std::runtime_error("**** ERROR: mesh nodes were not found in the XML file! ****"); + //std::cout << "ERROR: mesh nodes were not found in the XML file!" << std::endl; + } + if (size!=num_nodes*3){ + throw std::runtime_error("ERROR: failed to read all the mesh nodes!"); + //std::cout << "ERROR: failed to read all the mesh nodes!" << std::endl; + } + node_coords.update_device(); + + // dimensional scaling of the mesh + const double scl_x = mesh_inps.scale_x; + const double scl_y = mesh_inps.scale_y; + const double scl_z = mesh_inps.scale_z; + + // save the node coordinates to the state array + FOR_ALL(node_gid, 0, mesh.num_nodes, { + + // save the nodal coordinates + node.coords(node_gid, 0) = scl_x*node_coords(node_gid, 0); // double + node.coords(node_gid, 1) = scl_y*node_coords(node_gid, 1); // double + if(num_dims==3){ + node.coords(node_gid, 2) = scl_z*node_coords(node_gid, 2); // double + } + + }); // end for parallel nodes + node.coords.update_host(); + + + // --- + // Nodes in the element + // --- + + // fill temporary nodes in the element array + // array dims are (num_elems,num_nodes_in_elem) + found = extract_values_xml(connectivity.host.pointer(), + "\"connectivity\"", + stop, + in, + size); + if(found==false){ + std::cout << "ERROR: mesh connectivity was not found in the XML file!" << std::endl; + } + connectivity.update_device(); + + // array dims are the (num_elems) + // 8 = pixal i,j,k linear quad format + // 9 = linear quad ensight ordering + // 12 = linear ensight hex ordering + // 72 = VTK_LAGRANGE_HEXAHEDRON + // .... 
+ found = extract_values_xml(elem_types.host.pointer(), + "\"types\"", + stop, + in, + size); + if(found==false){ + std::cout << "ERROR: element types were not found in the XML file!" << std::endl; + } + elem_types.update_device(); + + // check that the element type is supported by Fierro + FOR_ALL (elem_gid, 0, mesh.num_elems, { + if(elem_types(elem_gid) == element_types::linear_quad || + elem_types(elem_gid) == element_types::linear_hex_ijk || + elem_types(elem_gid) == element_types::linear_hex || + elem_types(elem_gid) == element_types::arbitrary_hex ) + { + // at least one of them is true + } + else + { + // unknown element used + Kokkos::abort("Unknown element type in the mesh \n"); + } + }); + + // Convert from ensight linear hex to a IJK mesh + CArrayKokkos convert_ensight_to_ijk(8, "convert_ensight_to_ijk"); + + // Convert the arbitrary order hex to a IJK mesh + DCArrayKokkos convert_pn_vtk_to_ijk(mesh.num_nodes_in_elem, "convert_pn_vtk_to_ijk"); + + //build the connectivity for element type 12 + // elem_types.host(0) + switch(elem_types.host(0)){ + + case element_types::linear_quad: + // the node order is correct, no changes required + + FOR_ALL (elem_gid, 0, mesh.num_elems, { + + for (size_t node_lid=0; node_lid origin(num_dim); + // SimulationParamaters.mesh_input.origin.update_host(); + for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } + + // --- 2D parameters --- + // const int num_faces_in_elem = 4; // number of faces in elem + // const int num_points_in_elem = 4; // number of points in elem + // const int num_points_in_face = 2; // number of points in a face + // const int num_edges_in_elem = 4; // number of edges in a elem + + // --- mesh node ordering --- + // Convert ijk index system to the finite element numbering convention + // for vertices in elem + auto convert_point_number_in_quad = CArray(4); + convert_point_number_in_quad(0) = 0; + convert_point_number_in_quad(1) = 1; + 
convert_point_number_in_quad(2) = 3; + convert_point_number_in_quad(3) = 2; + + // intialize node variables + mesh.initialize_nodes(num_nodes); + + // initialize node state, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dim, required_node_state); + + // --- Build nodes --- + + // populate the point data structures + for (int j = 0; j < num_points_j; j++) { + for (int i = 0; i < num_points_i; i++) { + // global id for the point + int node_gid = get_id(i, j, 0, num_points_i, num_points_j); + + // store the point coordinates + node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; + node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; + } // end for i + } // end for j + + + node.coords.update_device(); + + // initialize elem variables + mesh.initialize_elems(num_elems, num_dim); + + // populate the elem center data structures + for (int j = 0; j < num_elems_j; j++) { + for (int i = 0; i < num_elems_i; i++) { + // global id for the elem + int elem_gid = get_id(i, j, 0, num_elems_i, num_elems_j); + + // store the point IDs for this elem where the range is + // (i:i+1, j:j+1 for a linear quad + int this_point = 0; + + for (int jcount = j; jcount <= j + 1; jcount++) { + for (int icount = i; icount <= i + 1; icount++) { + // global id for the points + int node_gid = get_id(icount, jcount, 0, num_points_i, num_points_j); + + // convert this_point index to the FE index convention + int this_index = convert_point_number_in_quad(this_point); + + // store the points in this elem according the the finite + // element numbering convention + mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; + + // increment the point counting index + this_point = this_point + 1; + } // end for icount + } // end for jcount + } // end for i + } // end for j + + // update device side + mesh.nodes_in_elem.update_device(); + + // intialize corner 
variables + int num_corners = num_elems * mesh.num_nodes_in_elem; + mesh.initialize_corners(num_corners); + // corner.initialize(num_corners, num_dim); + + // Build connectivity + mesh.build_connectivity(); + } // end build_2d_box + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn build_2d_polar + /// + /// \brief Builds an unstructured 2D polar mesh + /// + /// \param Simulation mesh that is built + /// \param Element state data + /// \param Node state data + /// \param Corner state data + /// \param Simulation parameters + /// + ///////////////////////////////////////////////////////////////////////////// + void build_2d_polar(Mesh_t& mesh, + GaussPoint_t& GaussPoints, + node_t& node, + corner_t& corner, + SimulationParameters_t& SimulationParamaters) const + { + printf("Creating a 2D polar mesh \n"); + + int num_dim = 2; + + const double inner_radius = SimulationParamaters.mesh_input.inner_radius; + const double outer_radius = SimulationParamaters.mesh_input.outer_radius; + + const double start_angle = PI / 180.0 * SimulationParamaters.mesh_input.starting_angle; + const double end_angle = PI / 180.0 * SimulationParamaters.mesh_input.ending_angle; + + const int num_elems_i = SimulationParamaters.mesh_input.num_radial_elems; + const int num_elems_j = SimulationParamaters.mesh_input.num_angular_elems; + + const int num_points_i = num_elems_i + 1; // num points in x + const int num_points_j = num_elems_j + 1; // num points in y + + const int num_nodes = num_points_i * num_points_j; + + const double dx = (outer_radius - inner_radius) / ((double)num_elems_i); // len/(elems) + const double dy = (end_angle - start_angle) / ((double)num_elems_j); // len/(elems) + + const int num_elems = num_elems_i * num_elems_j; + + std::vector origin(num_dim); + + for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } + + // --- 2D parameters --- + // const int num_faces_in_elem = 4; // number of faces 
in elem + // const int num_points_in_elem = 4; // number of points in elem + // const int num_points_in_face = 2; // number of points in a face + // const int num_edges_in_elem = 4; // number of edges in a elem + + // --- mesh node ordering --- + // Convert ijk index system to the finite element numbering convention + // for vertices in elem + auto convert_point_number_in_quad = CArray(4); + convert_point_number_in_quad(0) = 0; + convert_point_number_in_quad(1) = 1; + convert_point_number_in_quad(2) = 3; + convert_point_number_in_quad(3) = 2; + + // intialize node variables + mesh.initialize_nodes(num_nodes); + + // initialize node state, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dim, required_node_state); + + // populate the point data structures + for (int j = 0; j < num_points_j; j++) { + for (int i = 0; i < num_points_i; i++) { + // global id for the point + int node_gid = get_id(i, j, 0, num_points_i, num_points_j); + + double r_i = inner_radius + (double)i * dx; + double theta_j = start_angle + (double)j * dy; + + // store the point coordinates + node.coords.host(node_gid, 0) = origin[0] + r_i * cos(theta_j); + node.coords.host(node_gid, 1) = origin[1] + r_i * sin(theta_j); + + if(node.coords.host(node_gid, 0) < 0.0){ + throw std::runtime_error("**** NODE RADIUS FOR RZ MESH MUST BE POSITIVE ****"); + } + + } // end for i + } // end for j + + + node.coords.update_device(); + + // initialize elem variables + mesh.initialize_elems(num_elems, num_dim); + + // populate the elem center data structures + for (int j = 0; j < num_elems_j; j++) { + for (int i = 0; i < num_elems_i; i++) { + // global id for the elem + int elem_gid = get_id(i, j, 0, num_elems_i, num_elems_j); + + // store the point IDs for this elem where the range is + // (i:i+1, j:j+1 for a linear quad + int this_point = 0; + + for (int jcount = j; jcount <= j + 1; 
jcount++) { + for (int icount = i; icount <= i + 1; icount++) { + // global id for the points + int node_gid = get_id(icount, jcount, 0, num_points_i, num_points_j); + + // convert this_point index to the FE index convention + int this_index = convert_point_number_in_quad(this_point); + + // store the points in this elem according the the finite + // element numbering convention + mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; + + // increment the point counting index + this_point = this_point + 1; + } // end for icount + } // end for jcount + } // end for i + } // end for j + + // update device side + mesh.nodes_in_elem.update_device(); + + // intialize corner variables + int num_corners = num_elems * mesh.num_nodes_in_elem; + mesh.initialize_corners(num_corners); + // corner.initialize(num_corners, num_dim); + + // Build connectivity + mesh.build_connectivity(); + } // end build_2d_box + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn build_3d_box + /// + /// \brief Builds an unstructured 3D rectilinear mesh + /// + /// \param Simulation mesh that is built + /// \param Element state data + /// \param Node state data + /// \param Corner state data + /// \param Simulation parameters + /// + ///////////////////////////////////////////////////////////////////////////// + void build_3d_box(Mesh_t& mesh, + GaussPoint_t& GaussPoints, + node_t& node, + corner_t& corner, + SimulationParameters_t& SimulationParamaters) const + { + printf("Creating a 3D box mesh \n"); + + const int num_dim = 3; + + // SimulationParamaters.mesh_input.length.update_host(); + const double lx = SimulationParamaters.mesh_input.length[0]; + const double ly = SimulationParamaters.mesh_input.length[1]; + const double lz = SimulationParamaters.mesh_input.length[2]; + + // SimulationParamaters.mesh_input.num_elems.update_host(); + const int num_elems_i = SimulationParamaters.mesh_input.num_elems[0]; + const int num_elems_j = 
SimulationParamaters.mesh_input.num_elems[1]; + const int num_elems_k = SimulationParamaters.mesh_input.num_elems[2]; + + const int num_points_i = num_elems_i + 1; // num points in x + const int num_points_j = num_elems_j + 1; // num points in y + const int num_points_k = num_elems_k + 1; // num points in y + + const int num_nodes = num_points_i * num_points_j * num_points_k; + + const double dx = lx / ((double)num_elems_i); // len/(num_elems_i) + const double dy = ly / ((double)num_elems_j); // len/(num_elems_j) + const double dz = lz / ((double)num_elems_k); // len/(num_elems_k) + + const int num_elems = num_elems_i * num_elems_j * num_elems_k; + + std::vector origin(num_dim); + // SimulationParamaters.mesh_input.origin.update_host(); + for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } + + // --- 3D parameters --- + // const int num_faces_in_elem = 6; // number of faces in elem + // const int num_points_in_elem = 8; // number of points in elem + // const int num_points_in_face = 4; // number of points in a face + // const int num_edges_in_elem = 12; // number of edges in a elem + + + // initialize mesh node variables + mesh.initialize_nodes(num_nodes); + + // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dim, required_node_state); + + // --- Build nodes --- + + // populate the point data structures + for (int k = 0; k < num_points_k; k++) { + for (int j = 0; j < num_points_j; j++) { + for (int i = 0; i < num_points_i; i++) { + // global id for the point + int node_gid = get_id(i, j, k, num_points_i, num_points_j); + + // store the point coordinates + node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; + node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; + node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; + } // end for i + } // 
end for j + } // end for k + + + node.coords.update_device(); + + // initialize elem variables + mesh.initialize_elems(num_elems, num_dim); + + // --- Build elems --- + + // populate the elem center data structures + for (int k = 0; k < num_elems_k; k++) { + for (int j = 0; j < num_elems_j; j++) { + for (int i = 0; i < num_elems_i; i++) { + // global id for the elem + int elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); + + // store the point IDs for this elem where the range is + // (i:i+1, j:j+1, k:k+1) for a linear hexahedron + int this_point = 0; + for (int kcount = k; kcount <= k + 1; kcount++) { + for (int jcount = j; jcount <= j + 1; jcount++) { + for (int icount = i; icount <= i + 1; icount++) { + // global id for the points + int node_gid = get_id(icount, jcount, kcount, + num_points_i, num_points_j); + + // convert this_point index to the FE index convention + int this_index = this_point; //convert_point_number_in_Hex(this_point); + + // store the points in this elem according the the finite + // element numbering convention + mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; + + // increment the point counting index + this_point = this_point + 1; + } // end for icount + } // end for jcount + } // end for kcount + } // end for i + } // end for j + } // end for k + + // update device side + mesh.nodes_in_elem.update_device(); + + // initialize corner variables + int num_corners = num_elems * mesh.num_nodes_in_elem; + mesh.initialize_corners(num_corners); + // corner.initialize(num_corners, num_dim); + + // Build connectivity + mesh.build_connectivity(); + } // end build_3d_box + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn build_3d_HexN_box + /// + /// \brief Builds an unstructured high order 3D rectilinear mesh + /// + /// \param Simulation mesh that is built + /// \param Element state data + /// \param Node state data + /// \param Corner state data + /// \param Simulation parameters + /// + 
///////////////////////////////////////////////////////////////////////////// + void build_3d_HexN_box(Mesh_t& mesh, + GaussPoint_t& GaussPoints, + node_t& node, + corner_t& corner, + SimulationParameters_t& SimulationParamaters) const + { + printf(" ***** WARNING:: build_3d_HexN_box not yet implemented\n"); + const int num_dim = 3; + + // SimulationParamaters.mesh_input.length.update_host(); + const double lx = SimulationParamaters.mesh_input.length[0]; + const double ly = SimulationParamaters.mesh_input.length[1]; + const double lz = SimulationParamaters.mesh_input.length[2]; + + // SimulationParamaters.mesh_input.num_elems.update_host(); + const int num_elems_i = SimulationParamaters.mesh_input.num_elems[0]; + const int num_elems_j = SimulationParamaters.mesh_input.num_elems[1]; + const int num_elems_k = SimulationParamaters.mesh_input.num_elems[2]; + + // creating zones for the Pn order + const int Pn_order = SimulationParamaters.mesh_input.p_order; + + if (Pn_order > 19) { + printf("Fierro DG and RD solvers are only valid for elements up to Pn = 19 \n"); + return; + } + + const int num_zones_i = Pn_order*num_elems_i; + const int num_zones_j = Pn_order*num_elems_j; + const int num_zones_k = Pn_order*num_elems_k; + + const int num_points_i = num_zones_i+1; // num points in x accounting for Pn + const int num_points_j = num_zones_j+1; // num points in y accounting for Pn + const int num_points_k = num_zones_k+1; // num points in y accounting for Pn + + + const double dx = lx/((double)num_zones_i); // len/(num_zones_i) + const double dy = ly/((double)num_zones_j); // len/(num_zones_j) + const double dz = lz/((double)num_zones_k); // len/(num_zones_k) + + const int num_elems = num_elems_i*num_elems_j*num_elems_k; + // const int num_zones = num_zones_i*num_zones_j*num_zones_k; // accounts for Pn + + std::vector origin(num_dim); + for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } + + // --- 3D parameters --- + // const int 
num_faces_in_zone = 6; // number of faces in zone + // const int num_points_in_zone = 8; // number of points in zone + // const int num_points_in_face = 4; // number of points in a face + + // p_order = 1, 2, 3, 4, 5 + // num_nodes = 2, 3, 4, 5, 6 + const int num_1D_points = Pn_order+1; + const int num_points_in_elem = num_1D_points*num_1D_points*num_1D_points; + + + // --- elem --- + auto elem_coords = CArray (num_elems, num_dim); + auto elem_point_list = CArray (num_elems, num_points_in_elem); + + + // --- point --- + int num_points = num_points_i * num_points_j * num_points_k; + auto pt_coords = CArray (num_points, num_dim); + + + // --- Build nodes --- + + // initialize node variables + mesh.initialize_nodes(num_points); + + // + std::vector required_node_state = { node_state::coords }; + node.initialize(num_points, num_dim, required_node_state); + // populate the point data structures + for (int k = 0; k < num_points_k; k++){ + for (int j = 0; j < num_points_j; j++){ + for (int i = 0; i < num_points_i; i++){ + + + // global id for the point + int node_gid = get_id(i, j, k, num_points_i, num_points_j); + + // store the point coordinates + node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; + node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; + node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; + + } // end for k + } // end for i + } // end for j + + + node.coords.update_device(); + + + // initialize elem variables + mesh.initialize_elems(num_elems, num_dim); + + // --- Build elems --- + + // populate the elem center data structures accounting for Pn + for (int k=0; k graphics_times, + std::vector node_states, + std::vector gauss_pt_states, + std::vector material_pt_states, + const size_t solver_id) + { + + + // node_state is an enum for possible fields (e.g., coords, velocity, etc.), see state.h + // gauss_pt_state is an enum for possible fields (e.g., vol, divergence, etc.) 
+ // material_pt_state is an enum for possible fields (e.g., den, pres, etc.) + + + // ******************* + // Update host + // ******************* + + const size_t num_mats = State.MaterialPoints.num_material_points.size(); + + // material point values + + // Update host data for mat_pt state + for (auto field : material_pt_states){ + switch(field){ + // scalar vars to write out + case material_pt_state::density: + State.MaterialPoints.den.update_host(); + break; + case material_pt_state::pressure: + State.MaterialPoints.pres.update_host(); + break; + case material_pt_state::specific_internal_energy: + State.MaterialPoints.sie.update_host(); + break; + case material_pt_state::sound_speed: + State.MaterialPoints.sspd.update_host(); + break; + case material_pt_state::mass: + State.MaterialPoints.mass.update_host(); + break; + case material_pt_state::volume_fraction: + State.MaterialPoints.volfrac.update_host(); + State.MaterialPoints.geo_volfrac.update_host(); + break; + case material_pt_state::eroded_flag: + State.MaterialPoints.eroded.update_host(); + break; + // tensor vars to write out + case material_pt_state::stress: + State.MaterialPoints.stress.update_host(); + break; + + // additional vars for thermal-mechanical solver + case material_pt_state::thermal_conductivity: + State.MaterialPoints.conductivity.update_host(); + break; + + case material_pt_state::specific_heat: + State.MaterialPoints.specific_heat.update_host(); + break; + + // add other variables here + + // not used + case material_pt_state::elastic_modulii: + break; + case material_pt_state::shear_modulii: + break; + case material_pt_state::poisson_ratios: + break; + case material_pt_state::heat_flux: + break; + default: + std::cout<<"Desired material point state not understood in outputs"< elem_scalar_var_names(num_elem_scalar_vars); + std::vector elem_tensor_var_names(num_elem_tensor_vars); + + // Scalar, vector, and tensor values associated with a material in part elems + std::vector 
mat_elem_scalar_var_names(num_mat_pt_scalar_vars); + std::vector mat_elem_tensor_var_names(num_mat_pt_tensor_vars); + + + // the ids to access a variable in the mat_scalar_var_name or tensor list + int mat_den_id = -1; + int mat_pres_id = -1; + int mat_sie_id = -1; + int mat_sspd_id = -1; + int mat_mass_id = -1; + int mat_volfrac_id = -1; + int mat_geo_volfrac_id = -1; // geometric volume fraction of part + int mat_eroded_id = -1; + int mat_stress_id = -1; + + int mat_conductivity_id = -1; + int mat_specific_heat_id = -1; + + // the index for the scalar, vector, and tensor fields + size_t var = 0; + size_t vector_var = 0; + size_t tensor_var = 0; + + // material point state to output + for (auto field : SimulationParamaters.output_options.output_mat_pt_state){ + switch(field){ + // scalar vars + case material_pt_state::density: + mat_elem_scalar_var_names[var] = "mat_den"; + mat_den_id = var; + var++; + break; + case material_pt_state::pressure: + mat_elem_scalar_var_names[var] = "mat_pres"; + mat_pres_id = var; + var++; + break; + case material_pt_state::specific_internal_energy: + mat_elem_scalar_var_names[var] = "mat_sie"; + mat_sie_id = var; + var++; + break; + case material_pt_state::sound_speed: + mat_elem_scalar_var_names[var] = "mat_sspd"; + mat_sspd_id = var; + var++; + break; + case material_pt_state::mass: + mat_elem_scalar_var_names[var] = "mat_mass"; + mat_mass_id = var; + var++; + break; + case material_pt_state::volume_fraction: + mat_elem_scalar_var_names[var] = "mat_volfrac"; + mat_volfrac_id = var; + var++; + + mat_elem_scalar_var_names[var] = "mat_geo_volfrac"; + mat_geo_volfrac_id = var; + var++; + break; + case material_pt_state::eroded_flag: + mat_elem_scalar_var_names[var] = "mat_eroded"; + mat_eroded_id = var; + var++; + break; + // tensor vars + case material_pt_state::stress: + mat_elem_tensor_var_names[tensor_var] = "mat_stress"; + mat_stress_id = tensor_var; + tensor_var++; + break; + + + // additional vars for thermal-mechanical solver 
+ case material_pt_state::thermal_conductivity: + mat_elem_scalar_var_names[var] = "mat_thermal_K"; + mat_conductivity_id = var; + var++; + break; + + case material_pt_state::specific_heat: + mat_elem_scalar_var_names[var] = "mat_Cp"; + mat_specific_heat_id = var; + var++; + break; + + + // add other variables here + + // not used + case material_pt_state::elastic_modulii: + break; + case material_pt_state::shear_modulii: + break; + case material_pt_state::poisson_ratios: + break; + case material_pt_state::heat_flux: + break; + } // end switch + } // end for over mat_pt_states + + + // element average fields to output + + // the ids to access a variable in the elem_scalar_var_name or tensor list + int den_id = -1; + int pres_id = -1; + int sie_id = -1; + int sspd_id = -1; + int mass_id = -1; + int stress_id = -1; + + int conductivity_id = -1; + int specific_heat_id = -1; + + // reset the counters + var = 0; + vector_var = 0; + tensor_var = 0; + + // element state to output + for (auto field : SimulationParamaters.output_options.output_elem_state){ + switch(field){ + // scalar vars + case material_pt_state::density: + elem_scalar_var_names[var] = "den"; + den_id = var; + var++; + break; + case material_pt_state::pressure: + elem_scalar_var_names[var] = "pres"; + pres_id = var; + var++; + break; + case material_pt_state::specific_internal_energy: + elem_scalar_var_names[var] = "sie"; + sie_id = var; + var++; + break; + case material_pt_state::sound_speed: + elem_scalar_var_names[var] = "sspd"; + sspd_id = var; + var++; + break; + case material_pt_state::mass: + elem_scalar_var_names[var] = "mass"; + mass_id = var; + var++; + break; + // tensor vars + case material_pt_state::stress: + elem_tensor_var_names[tensor_var] = "stress"; + stress_id = tensor_var; + tensor_var++; + break; + + // heat transfer variables + case material_pt_state::thermal_conductivity: + elem_scalar_var_names[var] = "thermal_K"; + conductivity_id = var; + var++; + break; + + case 
material_pt_state::specific_heat: + elem_scalar_var_names[var] = "Cp"; + specific_heat_id = var; + var++; + break; + + // add other variables here + + // not used + case material_pt_state::volume_fraction: + break; + case material_pt_state::eroded_flag: + break; + case material_pt_state::elastic_modulii: + break; + case material_pt_state::shear_modulii: + break; + case material_pt_state::poisson_ratios: + break; + case material_pt_state::heat_flux: + break; + } // end switch + } // end for over mat_pt_states + + // append Gauss point vars to the element arrays + int vol_id = -1; + int div_id = -1; + int level_set_id = -1; + int vel_grad_id = -1; + + + for (auto field : SimulationParamaters.output_options.output_gauss_pt_state){ + switch(field){ + // scalars + case gauss_pt_state::volume: + elem_scalar_var_names[var] = "vol"; + vol_id = var; + var++; + break; + case gauss_pt_state::divergence_velocity: + elem_scalar_var_names[var] = "div"; + div_id = var; + var++; + break; + + case gauss_pt_state::level_set: + elem_scalar_var_names[var] = "level_set"; + level_set_id = var; + var++; + break; + + // tensors + case gauss_pt_state::gradient_velocity: + elem_tensor_var_names[tensor_var] = "vel_grad"; + vel_grad_id = tensor_var; + tensor_var++; + break; + } // end switch + } // end loop over gauss_pt_states + + + // ******************* + // nodal values + // ******************* + + size_t num_node_scalar_vars = 0; + size_t num_node_vector_vars = 0; + + for (auto field : SimulationParamaters.output_options.output_node_state){ + switch(field){ + // --- scalars + case node_state::mass: + num_node_scalar_vars ++; + break; + case node_state::temp: + num_node_scalar_vars ++; + break; + // -- vectors + case node_state::coords: + num_node_vector_vars ++; + break; + case node_state::velocity: + num_node_vector_vars ++; // for velocity + num_node_vector_vars ++; // for acceleration + break; + case node_state::gradient_level_set: + num_node_vector_vars ++; + break; + case 
node_state::force: + break; + + // heat transer vars + case node_state::heat_transfer: + break; + } // end switch + } // end for over + Kokkos::fence(); + + + // Scalar and vector values associated with a node + std::vector node_scalar_var_names(num_node_scalar_vars); + std::vector node_vector_var_names(num_node_vector_vars); + + int node_mass_id = -1; + int node_vel_id = -1; + int node_accel_id = -1; + int node_coord_id = -1; + int node_temp_id = -1; + int node_grad_level_set_id = -1; + + // reset counters for node fields + var = 0; + vector_var = 0; + tensor_var = 0; + + for (auto field : SimulationParamaters.output_options.output_node_state){ + switch(field){ + // scalars + case node_state::mass: + node_scalar_var_names[var] = "node_mass"; + node_mass_id = var; + var++; + break; + case node_state::temp: + node_scalar_var_names[var] = "node_temp"; + node_temp_id = var; + var++; + break; + + // vector fields + + case node_state::coords: + node_vector_var_names[vector_var] = "node_coords"; + node_coord_id = vector_var; + vector_var++; + break; + + case node_state::velocity: + node_vector_var_names[vector_var] = "node_vel"; + node_vel_id = vector_var; + vector_var++; + + node_vector_var_names[vector_var] = "node_accel"; + node_accel_id = vector_var; + vector_var++; + break; + + case node_state::gradient_level_set: + node_vector_var_names[vector_var] = "node_grad_lvlset"; + node_grad_level_set_id = vector_var; + vector_var++; + break; + + // -- not used vars + case node_state::force: + break; + + // heat transer vars + case node_state::heat_transfer: + break; + + // tensors + + } // end switch + } // end for over + + + // ************************************** + // build and save element average fields + // ************************************** + + // short hand + const size_t num_nodes = mesh.num_nodes; + const size_t num_elems = mesh.num_elems; + const size_t num_dims = mesh.num_dims; + const size_t num_nodes_in_elem = mesh.num_nodes_in_elem; + const int Pn_order 
= mesh.Pn; + + // save the elem state to an array for exporting to graphics files + DCArrayKokkos elem_scalar_fields(num_elem_scalar_vars, num_elems, "elem_scalars"); + DCArrayKokkos elem_tensor_fields(num_elem_tensor_vars, num_elems, 3, 3, "elem_tensors"); + elem_scalar_fields.set_values(0.0); + elem_tensor_fields.set_values(0.0); + + + // ----------------------------------------------------------------------- + // save the output fields to a single element average array for all state + // ----------------------------------------------------------------------- + for (int mat_id = 0; mat_id < num_mats; mat_id++) { + + // material point and guass point state are concatenated together + concatenate_elem_fields(State.MaterialPoints, + State.GaussPoints, + elem_scalar_fields, + elem_tensor_fields, + State.MaterialToMeshMaps.elem_in_mat_elem, + SimulationParamaters.output_options.output_elem_state, + SimulationParamaters.output_options.output_gauss_pt_state, + State.MaterialToMeshMaps.num_mat_elems.host(mat_id), + mat_id, + num_elems, + den_id, + pres_id, + sie_id, + sspd_id, + mass_id, + stress_id, + vol_id, + div_id, + level_set_id, + vel_grad_id, + conductivity_id, + specific_heat_id); + } // end for mats + + // make specific fields for the element average + if (sie_id>=0){ + FOR_ALL(elem_gid, 0, num_elems, { + // get sie by dividing by the mass + elem_scalar_fields(sie_id, elem_gid) /= (elem_scalar_fields(mass_id, elem_gid)+1.e-20); + }); + } // end if + + Kokkos::fence(); + elem_scalar_fields.update_host(); + elem_tensor_fields.update_host(); + + + // ************************ + // Build the nodal fields + // ************************ + + // save the nodal fields to an array for exporting to graphics files + DCArrayKokkos node_scalar_fields(num_node_scalar_vars, num_nodes, "node_scalars"); + DCArrayKokkos node_vector_fields(num_node_vector_vars, num_nodes, 3, "node_tenors"); + + concatenate_nodal_fields(State.node, + node_scalar_fields, + node_vector_fields, + 
SimulationParamaters.output_options.output_node_state, + dt, + num_nodes, + num_dims, + node_mass_id, + node_vel_id, + node_accel_id, + node_coord_id, + node_grad_level_set_id, + node_temp_id); + + + Kokkos::fence(); + node_scalar_fields.update_host(); + node_vector_fields.update_host(); + + + // ******************************** + // Write the nodal and elem fields + // ******************************** + + if (SimulationParamaters.output_options.format == output_options::viz || + SimulationParamaters.output_options.format == output_options::viz_and_state) { + + // create the folder structure if it does not exist + struct stat st; + + if (stat("vtk", &st) != 0) { + int returnCode = system("mkdir vtk"); + + if (returnCode == 1) { + std::cout << "Unable to make vtk directory" << std::endl; + } + } + else{ + if(solver_id==0 && graphics_id==0){ + // delete the existing files inside + int returnCode = system("rm vtk/Fierro*"); + if (returnCode == 1) { + std::cout << "Unable to clear vtk/Fierro directory" << std::endl; + } + } + } + + if (stat("vtk/data", &st) != 0) { + int returnCode = system("mkdir vtk/data"); + if (returnCode == 1) { + std::cout << "Unable to make vtk/data directory" << std::endl; + } + } + else{ + if(solver_id==0 && graphics_id==0){ + // delete the existing files inside the folder + int returnCode = system("rm vtk/data/Fierro*"); + if (returnCode == 1) { + std::cout << "Unable to clear vtk/data directory" << std::endl; + } + } + } + + // call the .vtu writer for element fields + std::string elem_fields_name = "fields"; + + // make a view of node coords for passing into functions + ViewCArray node_coords_host(&State.node.coords.host(0,0), num_nodes, num_dims); + ViewCArray nodes_in_elem_host(&mesh.nodes_in_elem.host(0,0), num_elems, num_nodes_in_elem); + + + write_vtu(node_coords_host, + nodes_in_elem_host, + elem_scalar_fields, + elem_tensor_fields, + node_scalar_fields, + node_vector_fields, + elem_scalar_var_names, + elem_tensor_var_names, + 
node_scalar_var_names, + node_vector_var_names, + elem_fields_name, + graphics_id, + num_nodes, + num_elems, + num_nodes_in_elem, + Pn_order, + num_dims, + solver_id); + + + // ******************************** + // Build and write the mat fields + // ******************************** + + + // note: the file path and folder was created in the elem and node outputs + size_t num_mat_files_written = 0; + if(num_mat_pt_scalar_vars > 0 || num_mat_pt_tensor_vars >0){ + + for (int mat_id = 0; mat_id < num_mats; mat_id++) { + + const size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); + + // only save material data if the mat lives on the mesh, ie. has state allocated + if (num_mat_elems>0){ + + // set the nodal vars to zero size, we don't write these fields again + node_scalar_var_names.clear(); + node_vector_var_names.clear(); + + // the arrays storing all the material field data + DCArrayKokkos mat_elem_scalar_fields(num_mat_pt_scalar_vars, num_mat_elems, "mat_pt_scalars"); + DCArrayKokkos mat_elem_tensor_fields(num_mat_pt_tensor_vars, num_mat_elems, 3, 3, "mat_pt_tensors"); + + + // concatenate material fields into a single array + concatenate_mat_fields(State.MaterialPoints, + mat_elem_scalar_fields, + mat_elem_tensor_fields, + State.MaterialToMeshMaps.elem_in_mat_elem, + SimulationParamaters.output_options.output_mat_pt_state, + num_mat_elems, + mat_id, + mat_den_id, + mat_pres_id, + mat_sie_id, + mat_sspd_id, + mat_mass_id, + mat_volfrac_id, + mat_geo_volfrac_id, + mat_eroded_id, + mat_stress_id, + mat_conductivity_id, + mat_specific_heat_id); + Kokkos::fence(); + mat_elem_scalar_fields.update_host(); + mat_elem_tensor_fields.update_host(); + + + std::string str_mat_val = std::to_string(mat_id); + std::string mat_fields_name = "mat"; + mat_fields_name += str_mat_val; // add the mat number + + // save the nodes belonging to this part (i.e., the material) + DCArrayKokkos mat_node_coords(num_nodes,num_dims, "mat_node_coords"); + DCArrayKokkos 
mat_nodes_in_mat_elem(num_mat_elems, num_nodes_in_elem, "mat_nodes_in_mat_elem"); + + // the number of actual nodes belonging to the part (i.e., the material) + size_t num_mat_nodes = 0; + + // build a unique mesh (element and nodes) for the material (i.e., the part) + build_material_elem_node_lists(mesh, + State.node.coords, + mat_node_coords, + mat_nodes_in_mat_elem, + State.MaterialToMeshMaps.elem_in_mat_elem, + mat_id, + num_mat_nodes, + num_mat_elems, + num_nodes_in_elem, + num_dims); + + ViewCArray mat_node_coords_host(&mat_node_coords.host(0,0), num_mat_nodes, num_dims); + ViewCArray mat_nodes_in_elem_host(&mat_nodes_in_mat_elem.host(0,0), num_mat_elems, num_nodes_in_elem); + + // write out a vtu file this + write_vtu(mat_node_coords_host, + mat_nodes_in_elem_host, + mat_elem_scalar_fields, + mat_elem_tensor_fields, + node_scalar_fields, + node_vector_fields, + mat_elem_scalar_var_names, + mat_elem_tensor_var_names, + node_scalar_var_names, + node_vector_var_names, + mat_fields_name, + graphics_id, + num_mat_nodes, + num_mat_elems, + num_nodes_in_elem, + Pn_order, + num_dims, + solver_id); + + + num_mat_files_written++; + + } // end for mat_id + + } // end if material is on the mesh + + } // end if mat variables are to be written + + + // ************************************************* + // write Paraview files to open the graphics files + // ************************************************* + + // save the graphics time + graphics_times(graphics_id) = time_value; + + // check to see if an mesh state was written + bool write_mesh_state = false; + if( num_elem_scalar_vars > 0 || + num_elem_tensor_vars > 0 || + num_node_scalar_vars > 0 || + num_node_vector_vars > 0) + { + write_mesh_state = true; + } + + // check to see if a mat state was written + bool write_mat_pt_state = false; + if( num_mat_pt_scalar_vars > 0 || + num_mat_pt_tensor_vars > 0) + { + write_mat_pt_state = true; + } + + // call the vtm file writer + std::string mat_fields_name = "mat"; + 
write_vtm(graphics_times, + elem_fields_name, + mat_fields_name, + time_value, + graphics_id, + num_mat_files_written, + write_mesh_state, + write_mat_pt_state, + solver_id); + + // call the pvd file writer + write_pvd(graphics_times, + time_value, + graphics_id, + solver_id); + + + // increment graphics id counter + graphics_id++; // this is private variable in the class + + } // end if viz paraview output is to be written + + + // STATE + if (SimulationParamaters.output_options.format == output_options::state || + SimulationParamaters.output_options.format == output_options::viz_and_state) { + + write_material_point_state(mesh, + State, + SimulationParamaters, + time_value, + graphics_times, + node_states, + gauss_pt_states, + material_pt_states); + + } // end if state is to be written + + + // will drop ensight outputs in the near future + if (SimulationParamaters.output_options.format == output_options::ensight){ + write_ensight(mesh, + State, + SimulationParamaters, + dt, + time_value, + graphics_times, + node_states, + gauss_pt_states, + material_pt_states); + } + + return; + + } // end write_mesh + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn write_ensight + /// + /// \brief Writes an ensight output file + /// + /// \param Simulation mesh + /// \param State data + /// \param Simulation parameters + /// \param current time value + /// \param Vector of all graphics output times + /// + ///////////////////////////////////////////////////////////////////////////// + void write_ensight(Mesh_t& mesh, + State_t& State, + SimulationParameters_t& SimulationParamaters, + double dt, + double time_value, + CArray graphics_times, + std::vector node_states, + std::vector gauss_pt_states, + std::vector material_pt_states) + { + size_t num_mats = State.MaterialPoints.num_material_points.size(); + + // ---- Update host data ---- + + // material point values + State.MaterialPoints.den.update_host(); + 
State.MaterialPoints.pres.update_host(); + State.MaterialPoints.stress.update_host(); + State.MaterialPoints.sspd.update_host(); + State.MaterialPoints.sie.update_host(); + State.MaterialPoints.mass.update_host(); + State.MaterialPoints.eroded.update_host(); + + + // gauss point values + State.GaussPoints.vol.update_host(); + + // nodal values + State.node.coords.update_host(); + State.node.vel.update_host(); + State.node.mass.update_host(); + + Kokkos::fence(); + + // -------------------------- + + const int num_scalar_vars = 10; + const int num_vec_vars = 3; + + std::string name_tmp; + name_tmp = "Outputs_SGH"; + + char* name = new char [name_tmp.length() + 1]; + std::strcpy(name, name_tmp.c_str()); + + const char scalar_var_names[num_scalar_vars][15] = { + "den", "pres", "sie", "vol", "mass", "sspd", "speed", "mat_id", "elem_switch", "eroded" + }; + + const char vec_var_names[num_vec_vars][15] = { + "pos", "vel", "accel" + }; + + // short hand + const size_t num_nodes = mesh.num_nodes; + const size_t num_elems = mesh.num_elems; + const size_t num_dims = mesh.num_dims; + + // save the cell state to an array for exporting to graphics files + auto elem_fields = CArray(num_elems, num_scalar_vars); + int elem_switch = 1; + + + DCArrayKokkos speed(num_elems, "speed"); + FOR_ALL(elem_gid, 0, num_elems, { + double elem_vel[3]; // note:initialization with a list won't work + elem_vel[0] = 0.0; + elem_vel[1] = 0.0; + elem_vel[2] = 0.0; + // get the coordinates of the element center + for (int node_lid = 0; node_lid < mesh.num_nodes_in_elem; node_lid++) { + elem_vel[0] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 0); + elem_vel[1] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 1); + if (mesh.num_dims == 3) { + elem_vel[2] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 2); + } + else{ + elem_vel[2] = 0.0; + } + } // end loop over nodes in element + elem_vel[0] = elem_vel[0] / mesh.num_nodes_in_elem; + elem_vel[1] = elem_vel[1] / 
mesh.num_nodes_in_elem; + elem_vel[2] = elem_vel[2] / mesh.num_nodes_in_elem; + + double speed_sqrd = 0.0; + for (int dim = 0; dim < num_dims; dim++) { + speed_sqrd += elem_vel[dim] * elem_vel[dim]; + } + speed(elem_gid) = sqrt(speed_sqrd); + }); // end parallel for + speed.update_host(); + Kokkos::fence(); + + // save the output scale fields to a single 2D array + + // export material centeric data to the elements + for (int mat_id = 0; mat_id < num_mats; mat_id++) { + size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); + + for (size_t mat_elem_sid = 0; mat_elem_sid < num_mat_elems; mat_elem_sid++) { + // 1 material per element + + // get elem gid + size_t elem_gid = State.MaterialToMeshMaps.elem_in_mat_elem.host(mat_id, mat_elem_sid); + + // save outputs + elem_fields(elem_gid, 0) = State.MaterialPoints.den.host(mat_id, mat_elem_sid); + elem_fields(elem_gid, 1) = State.MaterialPoints.pres.host(mat_id, mat_elem_sid); + elem_fields(elem_gid, 2) = State.MaterialPoints.sie.host(mat_id, mat_elem_sid); + // 3 is guass point vol + elem_fields(elem_gid, 4) = State.MaterialPoints.mass.host(mat_id, mat_elem_sid); + elem_fields(elem_gid, 5) = State.MaterialPoints.sspd.host(mat_id, mat_elem_sid); + // 6 is elem speed + elem_fields(elem_gid, 7) = (double)mat_id; + // 8 is the e_switch + elem_fields(elem_gid, 9) = (double)State.MaterialPoints.eroded.host(mat_id, mat_elem_sid); + } // end for mat elems storage + } // end parallel loop over materials + + // export element centric data + double e_switch = 1; + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + elem_fields(elem_gid, 3) = State.GaussPoints.vol.host(elem_gid); + elem_fields(elem_gid, 6) = speed.host(elem_gid); + elem_fields(elem_gid, 8) = e_switch; + elem_switch *= -1; + } // end for elem_gid + + // save the vertex vector fields to an array for exporting to graphics files + CArray vec_fields(num_nodes, num_vec_vars, 3); + + for (size_t node_gid = 0; node_gid < num_nodes; 
node_gid++) { + // position, var 0 + vec_fields(node_gid, 0, 0) = State.node.coords.host(node_gid, 0); + vec_fields(node_gid, 0, 1) = State.node.coords.host(node_gid, 1); + if (num_dims == 2) { + vec_fields(node_gid, 0, 2) = 0.0; + } + else{ + vec_fields(node_gid, 0, 2) = State.node.coords.host(node_gid, 2); + } + + // velocity, var 1 + vec_fields(node_gid, 1, 0) = State.node.vel.host(node_gid, 0); + vec_fields(node_gid, 1, 1) = State.node.vel.host(node_gid, 1); + if (num_dims == 2) { + vec_fields(node_gid, 1, 2) = 0.0; + } + else{ + vec_fields(node_gid, 1, 2) = State.node.vel.host(node_gid, 2); + } + + // accelleration, var 2 + vec_fields(node_gid, 2, 0) = (State.node.vel.host(node_gid, 0) - State.node.vel_n0.host(node_gid, 0))/dt; + vec_fields(node_gid, 2, 1) = (State.node.vel.host(node_gid, 1) - State.node.vel_n0.host(node_gid, 1))/dt; + if (num_dims == 2) { + vec_fields(node_gid, 2, 2) = 0.0; + } + else{ + vec_fields(node_gid, 2, 2) = (State.node.vel.host(node_gid, 2) - State.node.vel_n0.host(node_gid, 2))/dt; + } + + + } // end for loop over vertices + + + // --------------------------------------------------------------------------- + // Setup of file and directoring for exporting + // --------------------------------------------------------------------------- + FILE* out[20]; // the output files that are written to + char filename[128]; + int max_len = sizeof filename; + int str_output_len; + + struct stat st; + + if (stat("ensight", &st) != 0) { + system("mkdir ensight"); + } + + if (stat("ensight/data", &st) != 0) { + system("mkdir ensight/data"); + } + + // --------------------------------------------------------------------------- + // Write the Geometry file + // --------------------------------------------------------------------------- + // sprintf(filename, "ensight/data/%s.%05d.geo", name, graphics_id); + str_output_len = snprintf(filename, max_len, "ensight/data/%s.%05d.geo", name, graphics_id); + // filename has the full string + if 
(str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "A graphics dump by Fierro \n"); + + fprintf(out[0], "%s", "EnSight Gold geometry\n"); + fprintf(out[0], "%s", "node id assign\n"); + fprintf(out[0], "%s", "element id assign\n"); + + fprintf(out[0], "part\n"); + fprintf(out[0], "%10d\n", 1); + fprintf(out[0], "Mesh\n"); + + // --- vertices --- + fprintf(out[0], "coordinates\n"); + fprintf(out[0], "%10lu\n", num_nodes); + + // write all components of the point coordinates + for (int node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], "%12.5e\n", State.node.coords.host(node_gid, 0)); + } + + for (int node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], "%12.5e\n", State.node.coords.host(node_gid, 1)); + } + + for (int node_gid = 0; node_gid < num_nodes; node_gid++) { + if (num_dims == 3) { + fprintf(out[0], "%12.5e\n", State.node.coords.host(node_gid, 2)); + } + else{ + fprintf(out[0], "%12.5e\n", 0.0); + } + } + + // --- elements --- + if (num_dims == 3) { + fprintf(out[0], "hexa8\n"); + } + else{ + fprintf(out[0], "quad4\n"); + } + fprintf(out[0], "%10lu\n", num_elems); + + + int convert_ijk_to_ensight[8]; + if(mesh.num_dims==3){ + convert_ijk_to_ensight[0] = 0; + convert_ijk_to_ensight[1] = 1; + convert_ijk_to_ensight[2] = 3; + convert_ijk_to_ensight[3] = 2; + convert_ijk_to_ensight[4] = 4; + convert_ijk_to_ensight[5] = 5; + convert_ijk_to_ensight[6] = 7; + convert_ijk_to_ensight[7] = 6; + } + else{ + + convert_ijk_to_ensight[0] = 0; + convert_ijk_to_ensight[1] = 1; + convert_ijk_to_ensight[2] = 2; + convert_ijk_to_ensight[3] = 3; + convert_ijk_to_ensight[4] = 4; + convert_ijk_to_ensight[5] = 5; + convert_ijk_to_ensight[6] = 6; + convert_ijk_to_ensight[7] = 7; + } // end if + + + // write all global point numbers for this cell + for (int elem_gid = 0; elem_gid < num_elems; elem_gid++) { + for (int node_lid = 0; node_lid < 
mesh.num_nodes_in_elem; node_lid++) { + fprintf(out[0], "%10lu\t", mesh.nodes_in_elem.host(elem_gid, convert_ijk_to_ensight[node_lid]) + 1); // note: node_gid starts at 1 + } + fprintf(out[0], "\n"); + } + + fclose(out[0]); + + // --------------------------------------------------------------------------- + // Write the Scalar variable files + // --------------------------------------------------------------------------- + + // ensight_vars = (den, pres,...) + for (int var = 0; var < num_scalar_vars; var++) { + // write a scalar value + // sprintf(filename, "ensight/data/%s.%05d.%s", name, graphics_id, scalar_var_names[var]); + str_output_len = snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, scalar_var_names[var]); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "Per_elem scalar values\n"); + fprintf(out[0], "part\n"); + fprintf(out[0], "%10d\n", 1); + if (num_dims == 3) { + fprintf(out[0], "hexa8\n"); + } + else{ + fprintf(out[0], "quad4\n"); + } + + for (int elem_id = 0; elem_id < num_elems; elem_id++) { + fprintf(out[0], "%12.5e\n", elem_fields(elem_id, var)); + } + + fclose(out[0]); + } // end for var + + // --------------------------------------------------------------------------- + // Write the Vector variable files + // --------------------------------------------------------------------------- + + // ensight vector vars = (position, velocity, force) + for (int var = 0; var < num_vec_vars; var++) { + // sprintf(filename, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); + str_output_len = snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + out[0] = fopen(filename, "w"); + // fprintf(out[0],"Per_node vector values\n"); + // fprintf(out[0],"part\n"); + // 
fprintf(out[0],"%10d \n",1); + // fprintf(out[0],"hexa8\n"); // WARNING, maybe bug here? + + fprintf(out[0], "Per_node vector values\n"); + fprintf(out[0], "part\n"); + fprintf(out[0], "%10d\n", 1); + fprintf(out[0], "block\n"); + + for (int node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], "%12.5e\n", vec_fields(node_gid, var, 0)); + } + + for (int node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], "%12.5e\n", vec_fields(node_gid, var, 1)); + } + + for (int node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], "%12.5e\n", vec_fields(node_gid, var, 2)); + } + + fclose(out[0]); + } // end for var + + // --------------------------------------------------------------------------- + // Write the case file + // --------------------------------------------------------------------------- + + // sprintf(filename, "ensight/%s.case", name); + str_output_len = snprintf(filename, max_len, "ensight/%s.case", name); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "FORMAT\n"); + fprintf(out[0], "type: ensight gold\n"); + fprintf(out[0], "GEOMETRY\n"); + + // sprintf(filename, "model: data/%s.*****.geo\n", name); + str_output_len = snprintf(filename, max_len, "model: data/%s.*****.geo\n", name); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + fprintf(out[0], "%s", filename); + fprintf(out[0], "VARIABLE\n"); + + for (int var = 0; var < num_scalar_vars; var++) { + // sprintf(filename, "scalar per element: %s data/%s.*****.%s\n", scalar_var_names[var], name, scalar_var_names[var]); + str_output_len = snprintf(filename, max_len, "scalar per element: %s data/%s.*****.%s\n", scalar_var_names[var], name, scalar_var_names[var]); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + fprintf(out[0], "%s", filename); + } + + for (int var = 
0; var < num_vec_vars; var++) { + // sprintf(filename, "vector per node: %s data/%s.*****.%s\n", vec_var_names[var], name, vec_var_names[var]); + str_output_len = snprintf(filename, max_len, "vector per node: %s data/%s.*****.%s\n", vec_var_names[var], name, vec_var_names[var]); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + fprintf(out[0], "%s", filename); + } + + fprintf(out[0], "TIME\n"); + fprintf(out[0], "time set: 1\n"); + fprintf(out[0], "number of steps: %4d\n", graphics_id + 1); + fprintf(out[0], "filename start number: 0\n"); + fprintf(out[0], "filename increment: 1\n"); + fprintf(out[0], "time values: \n"); + + graphics_times(graphics_id) = time_value; + + for (int i = 0; i <= graphics_id; i++) { + fprintf(out[0], "%12.5e\n", graphics_times(i)); + } + fclose(out[0]); + + // --------------------------------------------------------------------------- + // Done writing the graphics dump + // --------------------------------------------------------------------------- + + // increment graphics id counter + graphics_id++; + + delete[] name; + + + return; + } + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn write_vtk_old + /// + /// \brief Writes a vtk output file + /// + /// \param Simulation mesh + /// \param State data + /// \param Simulation parameters + /// \param current time value + /// \param Vector of all graphics output times + /// + ///////////////////////////////////////////////////////////////////////////// + void write_vtk_old(Mesh_t& mesh, + State_t& State, + SimulationParameters_t& SimulationParamaters, + double dt, + double time_value, + CArray graphics_times, + std::vector node_states, + std::vector gauss_pt_states, + std::vector material_pt_states) + { + + size_t num_mats = State.MaterialPoints.num_material_points.size(); + + // ---- Update host data ---- + + // material point values + State.MaterialPoints.den.update_host(); + 
State.MaterialPoints.pres.update_host(); + State.MaterialPoints.stress.update_host(); + State.MaterialPoints.sspd.update_host(); + State.MaterialPoints.sie.update_host(); + State.MaterialPoints.mass.update_host(); + State.MaterialPoints.conductivity.update_host(); + State.MaterialPoints.temp_grad.update_host(); + State.MaterialPoints.eroded.update_host(); + + + // gauss point values + State.GaussPoints.vol.update_host(); + + // nodal values + State.node.coords.update_host(); + State.node.vel.update_host(); + State.node.mass.update_host(); + State.node.temp.update_host(); + + Kokkos::fence(); + + + const int num_cell_scalar_vars = 13; + const int num_cell_vec_vars = 0; + const int num_cell_tensor_vars = 0; + + const int num_point_scalar_vars = 1; + const int num_point_vec_vars = 2; + + + // Scalar values associated with a cell + const char cell_scalar_var_names[num_cell_scalar_vars][15] = { + "den", "pres", "sie", "vol", "mass", "sspd", "speed", "mat_id", "elem_switch","eroded", "temp_grad_x", "temp_grad_y", "temp_grad_z" + }; + + const char cell_vec_var_names[num_cell_vec_vars][15] = { + + }; + + const char point_scalar_var_names[num_point_scalar_vars][15] = { + "temp" + }; + + const char point_vec_var_names[num_point_vec_vars][15] = { + "pos", "vel" + }; + + // short hand + const size_t num_nodes = mesh.num_nodes; + const size_t num_elems = mesh.num_elems; + const size_t num_dims = mesh.num_dims; + + // save the cell state to an array for exporting to graphics files + auto elem_fields = CArray(num_elems, num_cell_scalar_vars); + int elem_switch = 1; + + DCArrayKokkos speed(num_elems, "speed"); + FOR_ALL(elem_gid, 0, num_elems, { + double elem_vel[3]; // note:initialization with a list won't work + elem_vel[0] = 0.0; + elem_vel[1] = 0.0; + elem_vel[2] = 0.0; + // get the coordinates of the element center + for (int node_lid = 0; node_lid < mesh.num_nodes_in_elem; node_lid++) { + elem_vel[0] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 0); + elem_vel[1] 
+= State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 1); + if (mesh.num_dims == 3) { + elem_vel[2] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 2); + } + else{ + elem_vel[2] = 0.0; + } + } // end loop over nodes in element + elem_vel[0] = elem_vel[0] / mesh.num_nodes_in_elem; + elem_vel[1] = elem_vel[1] / mesh.num_nodes_in_elem; + elem_vel[2] = elem_vel[2] / mesh.num_nodes_in_elem; + + double speed_sqrd = 0.0; + for (int dim = 0; dim < num_dims; dim++) { + speed_sqrd += elem_vel[dim] * elem_vel[dim]; + } + speed(elem_gid) = sqrt(speed_sqrd); + }); // end parallel for + speed.update_host(); + Kokkos::fence(); + + // save the output scale fields to a single 2D array + + + // export material centeric data to the elements + for (int mat_id = 0; mat_id < num_mats; mat_id++) { + size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); + + for (size_t mat_elem_sid = 0; mat_elem_sid < num_mat_elems; mat_elem_sid++) { + // 1 material per element + + // get elem gid + size_t elem_gid = State.MaterialToMeshMaps.elem_in_mat_elem.host(mat_id, mat_elem_sid); + + // save outputs + elem_fields(elem_gid, 0) = State.MaterialPoints.den.host(mat_id,mat_elem_sid); + elem_fields(elem_gid, 1) = State.MaterialPoints.pres.host(mat_id, mat_elem_sid); + elem_fields(elem_gid, 2) = State.MaterialPoints.sie.host(mat_id, mat_elem_sid); + // 3 is guass point vol + elem_fields(elem_gid, 4) = State.MaterialPoints.mass.host(mat_id, mat_elem_sid); + elem_fields(elem_gid, 5) = State.MaterialPoints.sspd.host(mat_id, mat_elem_sid); + // 6 is elem speed + elem_fields(elem_gid, 7) = (double)mat_id; + // 8 is the e_switch + elem_fields(elem_gid, 9) = (double)State.MaterialPoints.eroded.host(mat_id, mat_elem_sid); + elem_fields(elem_gid, 10) = (double)State.MaterialPoints.temp_grad.host(mat_id, elem_gid,0); + elem_fields(elem_gid, 11) = (double)State.MaterialPoints.temp_grad.host(mat_id, elem_gid,1); + elem_fields(elem_gid, 12) = 
(double)State.MaterialPoints.temp_grad.host(mat_id, elem_gid,2); + } // end for mat elems storage + } // end parallel loop over materials + + // export element centric data + double e_switch = 1; + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + elem_fields(elem_gid, 3) = State.GaussPoints.vol.host(elem_gid); + elem_fields(elem_gid, 6) = speed.host(elem_gid); + elem_fields(elem_gid, 8) = State.GaussPoints.div.host(elem_gid); + elem_switch *= -1; + } // end for elem_gid + + // save the vertex vector fields to an array for exporting to graphics files + CArray vec_fields(num_nodes, num_point_vec_vars, 3); + CArray point_scalar_fields(num_nodes, num_point_scalar_vars); + + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + // position, var 0 + vec_fields(node_gid, 0, 0) = State.node.coords.host(node_gid, 0); + vec_fields(node_gid, 0, 1) = State.node.coords.host(node_gid, 1); + if (num_dims == 2) { + vec_fields(node_gid, 0, 2) = 0.0; + } + else{ + vec_fields(node_gid, 0, 2) = State.node.coords.host(node_gid, 2); + } + + // position, var 1 + vec_fields(node_gid, 1, 0) = State.node.vel.host(node_gid, 0); + vec_fields(node_gid, 1, 1) = State.node.vel.host(node_gid, 1); + if (num_dims == 2) { + vec_fields(node_gid, 1, 2) = 0.0; + } + else{ + vec_fields(node_gid, 1, 2) = State.node.vel.host(node_gid, 2); + } + + point_scalar_fields(node_gid, 0) = State.node.temp.host(node_gid); + } // end for loop over vertices + + + FILE* out[20]; // the output files that are written to + char filename[100]; // char string + int max_len = sizeof filename; + int str_output_len; + + struct stat st; + + if (stat("vtk", &st) != 0) { + system("mkdir vtk"); + } + + // snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); + + //sprintf(filename, "vtk/Fierro.%05d.vtk", graphics_id); // mesh file + str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d.vtk", graphics_id); + if (str_output_len >= max_len) { fputs("Filename 
length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "# vtk DataFile Version 2.0\n"); // part 2 + fprintf(out[0], "Mesh for Fierro\n"); // part 2 + fprintf(out[0], "ASCII \n"); // part 3 + fprintf(out[0], "DATASET UNSTRUCTURED_GRID\n\n"); // part 4 + + fprintf(out[0], "POINTS %zu float\n", mesh.num_nodes); + + // write all components of the point coordinates + for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + fprintf(out[0], + "%f %f %f\n", + State.node.coords.host(node_gid, 0), + State.node.coords.host(node_gid, 1), + State.node.coords.host(node_gid, 2)); + } // end for + + /* + --------------------------------------------------------------------------- + Write the elems + --------------------------------------------------------------------------- + */ + + fprintf(out[0], "\n"); + fprintf(out[0], "CELLS %lu %lu\n", mesh.num_elems, mesh.num_elems + mesh.num_elems * mesh.num_nodes_in_elem); // size=all printed values + + int Pn_order = mesh.Pn; + int order[3] = { Pn_order, Pn_order, Pn_order }; + + // const int num_1D_points = Pn_order+1; + + // write all global point numbers for this elem + for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + fprintf(out[0], "%lu ", mesh.num_nodes_in_elem); // num points in this elem + + for (int k = 0; k <= Pn_order; k++) { + for (int j = 0; j <= Pn_order; j++) { + for (int i = 0; i <= Pn_order; i++) { + size_t node_lid = PointIndexFromIJK(i, j, k, order); + fprintf(out[0], "%lu ", mesh.nodes_in_elem.host(elem_gid, node_lid)); + } + } + } + + fprintf(out[0], "\n"); + } // end for + + // Write the element types + fprintf(out[0], "\n"); + fprintf(out[0], "CELL_TYPES %zu \n", mesh.num_elems); + // VTK_LAGRANGE_HEXAHEDRON: 72, + // VTK_HIGHER_ORDER_HEXAHEDRON: 67 + // VTK_BIQUADRATIC_QUADRATIC_HEXAHEDRON = 33 + // element types: https://vtk.org/doc/nightly/html/vtkCellType_8h_source.html + // element types: 
https://kitware.github.io/vtk-js/api/Common_DataModel_CellTypes.html + // vtk format: https://www.kitware.com//modeling-arbitrary-order-lagrange-finite-elements-in-the-visualization-toolkit/ + for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + fprintf(out[0], "%d \n", 72); + } + + /* + --------------------------------------------------------------------------- + Write the nodal vector variables to file + --------------------------------------------------------------------------- + */ + + fprintf(out[0], "\n"); + fprintf(out[0], "POINT_DATA %zu \n", mesh.num_nodes); + + // vtk vector vars = (position, velocity) + for (int var = 0; var < num_point_vec_vars; var++) { + fprintf(out[0], "VECTORS %s float \n", point_vec_var_names[var]); + for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + fprintf(out[0], "%f %f %f\n", + vec_fields(node_gid, var, 0), + vec_fields(node_gid, var, 1), + vec_fields(node_gid, var, 2)); + } // end for nodes + } // end for vec_vars + + + // vtk scalar vars = (temp) + for (int var = 0; var < num_point_scalar_vars; var++) { + fprintf(out[0], "SCALARS %s float 1\n", point_scalar_var_names[var]); + fprintf(out[0], "LOOKUP_TABLE default\n"); + for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + fprintf(out[0], "%f\n", + point_scalar_fields(node_gid, 0)); + } // end for nodes + } // end for vec_vars + + /* + --------------------------------------------------------------------------- + Write the scalar elem variable to file + --------------------------------------------------------------------------- + */ + fprintf(out[0], "\n"); + fprintf(out[0], "CELL_DATA %zu \n", mesh.num_elems); + + for (int var = 0; var < num_cell_scalar_vars; var++) { + fprintf(out[0], "SCALARS %s float 1\n", cell_scalar_var_names[var]); // the 1 is number of scalar components [1:4] + fprintf(out[0], "LOOKUP_TABLE default\n"); + for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + fprintf(out[0], "%f\n", 
elem_fields(elem_gid, var)); + } // end for elem + } // end for cell scalar_vars + + fclose(out[0]); + + graphics_times(graphics_id) = time_value; + + // Write time series metadata + //sprintf(filename, "vtk/Fierro.vtk.series", graphics_id); // mesh file + str_output_len = snprintf(filename, max_len, "vtk/Fierro.vtk.series"); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "{\n"); + fprintf(out[0], " \"file-series-version\" : \"1.0\",\n"); + fprintf(out[0], " \"files\" : [\n"); + + for (int i = 0; i <= graphics_id; i++) { + fprintf(out[0], " { \"name\" : \"Fierro.%05d.vtk\", \"time\" : %12.5e },\n", i, graphics_times(i) ); + } + + // fprintf(out[0], "%12.5e\n", graphics_times(i)); + fprintf(out[0], " ]\n"); // part 4 + fprintf(out[0], "}"); // part 4 + + fclose(out[0]); + + // increment graphics id counter + graphics_id++; + + + } // end write vtk old + + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn concatenate_elem_fields + /// + /// \brief A function to calculate the average of elem fields and concatentate into 1 array + /// + /// + /// \param MaterialPoints a struct containing the material point state arrays + /// \param elem_scalar_fields the scalar fields + /// \param elem_tensor_fields the tensor fields + /// \param elem_in_mat_elem a listing of the element ids the material resides in + /// \param output_elem_state a std::vector of enums specifying the elem avg outputs + /// \param num_mat_elems the number of elements the material resides in + /// \param mat_id the index for the material + /// + ///////////////////////////////////////////////////////////////////////////// + void concatenate_elem_fields(const MaterialPoint_t& MaterialPoints, + const GaussPoint_t& GaussPoints, + DCArrayKokkos& elem_scalar_fields, + DCArrayKokkos& elem_tensor_fields, + const DRaggedRightArrayKokkos& 
elem_in_mat_elem,
        const std::vector<material_pt_state>& output_elem_state,
        const std::vector<gauss_pt_state>& output_gauss_pt_states,
        const size_t num_mat_elems,
        const size_t mat_id,
        const size_t num_elems,
        const int den_id,
        const int pres_id,
        const int sie_id,
        const int sspd_id,
        const int mass_id,
        const int stress_id,
        const int vol_id,
        const int div_id,
        const int level_set_id,
        const int vel_grad_id,
        const int conductivity_id,
        const int specific_heat_id)
    {
        // ------------------------------------------------------------------
        // Material point states: accumulate this material's volume-fraction
        // weighted contribution into the element-average arrays.
        // ------------------------------------------------------------------
        for (auto field : output_elem_state){
            switch(field){
                // scalar vars
                case material_pt_state::density:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        // map material storage index to the global element id
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        elem_scalar_fields(den_id, elem_gid) += MaterialPoints.den(mat_id, mat_pt_lid)*
                                                                MaterialPoints.volfrac(mat_id, mat_pt_lid)*
                                                                MaterialPoints.geo_volfrac(mat_id, mat_pt_lid);
                    });
                    break;
                case material_pt_state::pressure:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        elem_scalar_fields(pres_id, elem_gid) += MaterialPoints.pres(mat_id, mat_pt_lid)*
                                                                 MaterialPoints.volfrac(mat_id, mat_pt_lid)*
                                                                 MaterialPoints.geo_volfrac(mat_id, mat_pt_lid);
                    });
                    break;
                case material_pt_state::specific_internal_energy:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        // mass-weighted: extensive ie is accumulated here and is
                        // converted to specific ie after this function returns
                        elem_scalar_fields(sie_id, elem_gid) += MaterialPoints.mass(mat_id, mat_pt_lid)*
                                                                MaterialPoints.sie(mat_id, mat_pt_lid);
                    });
                    break;
                case material_pt_state::sound_speed:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        elem_scalar_fields(sspd_id, elem_gid) += MaterialPoints.sspd(mat_id, mat_pt_lid)*
                                                                 MaterialPoints.volfrac(mat_id, mat_pt_lid)*
                                                                 MaterialPoints.geo_volfrac(mat_id, mat_pt_lid);
                    });
                    break;
                case material_pt_state::mass:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        // mass is extensive: summed directly, no volume fraction weight
                        elem_scalar_fields(mass_id, elem_gid) += MaterialPoints.mass(mat_id, mat_pt_lid);
                    });
                    break;

                // ---------------
                // tensor vars
                // ---------------
                case material_pt_state::stress:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        // tensor fields are always stored 3x3
                        // note: paraview is row-major, CArray convention
                        for (size_t i=0; i<3; i++){
                            for(size_t j=0; j<3; j++){
                                elem_tensor_fields(stress_id, elem_gid, i, j) +=
                                            MaterialPoints.stress(mat_id, mat_pt_lid,i,j) *
                                            MaterialPoints.volfrac(mat_id, mat_pt_lid)*
                                            MaterialPoints.geo_volfrac(mat_id, mat_pt_lid);
                            } // end for j
                        } // end for i
                    });
                    break;

                // thermal solver vars
                case material_pt_state::thermal_conductivity:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        elem_scalar_fields(conductivity_id, elem_gid) += MaterialPoints.conductivity(mat_id, mat_pt_lid)*
                                                                         MaterialPoints.volfrac(mat_id, mat_pt_lid)*
                                                                         MaterialPoints.geo_volfrac(mat_id, mat_pt_lid);
                    });
                    break;

                case material_pt_state::specific_heat:
                    FOR_ALL(mat_pt_lid, 0, num_mat_elems, {
                        size_t elem_gid = elem_in_mat_elem(mat_id, mat_pt_lid);
                        elem_scalar_fields(specific_heat_id, elem_gid) += MaterialPoints.specific_heat(mat_id, mat_pt_lid)*
                                                                          MaterialPoints.volfrac(mat_id, mat_pt_lid)*
                                                                          MaterialPoints.geo_volfrac(mat_id, mat_pt_lid);
                    });
                    break;

                // add other variables here

                // states with no element-average output
                case material_pt_state::volume_fraction:
                    break;
                case material_pt_state::eroded_flag:
                    break;
                case material_pt_state::elastic_modulii:
                    break;
                case material_pt_state::shear_modulii:
                    break;
                case material_pt_state::poisson_ratios:
                    break;
                case material_pt_state::heat_flux:
                    break;
            } // end switch
        } // end for over mat point states

        // ------------------------------------------------------------------
        // Gauss point states: element centric, copied (not accumulated)
        // ------------------------------------------------------------------
        for (auto field : output_gauss_pt_states){
            switch(field){
                // scalars
                case gauss_pt_state::volume:
                    FOR_ALL(elem_gid, 0, num_elems, {
                        elem_scalar_fields(vol_id, elem_gid) = GaussPoints.vol(elem_gid);
                    });
                    break;

                case gauss_pt_state::divergence_velocity:
                    FOR_ALL(elem_gid, 0, num_elems, {
                        elem_scalar_fields(div_id, elem_gid) = GaussPoints.div(elem_gid);
                    });
                    break;

                case gauss_pt_state::level_set:
                    FOR_ALL(elem_gid, 0, num_elems, {
                        elem_scalar_fields(level_set_id, elem_gid) = GaussPoints.level_set(elem_gid);
                    });
                    break;

                // tensors
                case gauss_pt_state::gradient_velocity:
                    // note: paraview is row-major, CArray convention
                    FOR_ALL(elem_gid, 0, num_elems, {
                        for (size_t i=0; i<3; i++){
                            for(size_t j=0; j<3; j++){
                                elem_tensor_fields(vel_grad_id, elem_gid, i, j) =
                                        GaussPoints.vel_grad(elem_gid, i, j);
                            } // end for j
                        } // end for i
                    });
                    break;

                // add other gauss variables here

            } // end switch
        } // end loop over gauss_pt_states

    } // end of function

    /////////////////////////////////////////////////////////////////////////////
    ///
    /// \fn concatenate_mat_fields
    ///
    /// \brief A function to concatenate material fields into 1 array
    ///
    /// \param MaterialPoints a struct containing the material point state arrays
    /// \param mat_elem_scalar_fields the scalar fields
    /// \param mat_elem_tensor_fields the tensor fields
    /// \param elem_in_mat_elem a listing of the element ids the material resides in
    /// \param output_material_pt_states a std::vector of enums specifying the model
    ///
\param num_mat_elems the number of elements the material resides in + /// \param mat_id the index for the material + /// + ///////////////////////////////////////////////////////////////////////////// + void concatenate_mat_fields(const MaterialPoint_t& MaterialPoints, + DCArrayKokkos& mat_elem_scalar_fields, + DCArrayKokkos& mat_elem_tensor_fields, + const DRaggedRightArrayKokkos& elem_in_mat_elem, + const std::vector& output_material_pt_states, + const size_t num_mat_elems, + const size_t mat_id, + const int mat_den_id, + const int mat_pres_id, + const int mat_sie_id, + const int mat_sspd_id, + const int mat_mass_id, + const int mat_volfrac_id, + const int mat_geo_volfrac_id, + const int mat_eroded_id, + const int mat_stress_id, + const int mat_conductivity_id, + const int mat_specific_heat_id) + { + + // --- loop over the material point states + + for (auto field : output_material_pt_states){ + switch(field){ + // scalar vars + case material_pt_state::density: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + mat_elem_scalar_fields(mat_den_id, mat_elem_sid) = MaterialPoints.den(mat_id, mat_elem_sid); + }); + break; + case material_pt_state::pressure: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + mat_elem_scalar_fields(mat_pres_id, mat_elem_sid) = MaterialPoints.pres(mat_id, mat_elem_sid); + }); + break; + case material_pt_state::specific_internal_energy: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + // extensive ie here, but after this function, it will become specific ie + mat_elem_scalar_fields(mat_sie_id, mat_elem_sid) = MaterialPoints.sie(mat_id, mat_elem_sid); + }); + break; + case material_pt_state::sound_speed: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + mat_elem_scalar_fields(mat_sspd_id, mat_elem_sid) = MaterialPoints.sspd(mat_id, mat_elem_sid); + }); + break; + case material_pt_state::mass: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + mat_elem_scalar_fields(mat_mass_id, mat_elem_sid) = 
MaterialPoints.mass(mat_id, mat_elem_sid); + }); + break; + case material_pt_state::volume_fraction: + // material volume fraction + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + // this is the volume fraction of a material within a part + mat_elem_scalar_fields(mat_volfrac_id, mat_elem_sid) = MaterialPoints.volfrac(mat_id, mat_elem_sid); + }); + + // geometric volume fraction + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + // this is the geometric volume fraction (interface reconstruction) + mat_elem_scalar_fields(mat_geo_volfrac_id, mat_elem_sid) = MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); + }); + break; + case material_pt_state::eroded_flag: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + mat_elem_scalar_fields(mat_eroded_id, mat_elem_sid) = (double)MaterialPoints.eroded(mat_id, mat_elem_sid); + }); + break; + // --------------- + // tensor vars + // --------------- + case material_pt_state::stress: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // field + // average tensor fields, it is always 3D + // note: paraview is row-major, CArray convention + for (size_t i=0; i<3; i++){ + for(size_t j=0; j<3; j++){ + + // stress tensor + mat_elem_tensor_fields(mat_stress_id, mat_elem_sid, i, j) = + MaterialPoints.stress(mat_id, mat_elem_sid,i,j); + } // end for + } // end for + }); + break; + + // thermal solver vars + case material_pt_state::thermal_conductivity: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // get elem gid + size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); + + // field + mat_elem_scalar_fields(mat_conductivity_id, elem_gid) += MaterialPoints.conductivity(mat_id, mat_elem_sid); + }); + break; + + case material_pt_state::specific_heat: + FOR_ALL(mat_elem_sid, 0, num_mat_elems, { + + // get elem gid + size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); + + // field + mat_elem_scalar_fields(mat_specific_heat_id, elem_gid) += MaterialPoints.specific_heat(mat_id, mat_elem_sid); + }); + break; + + // 
add other variables here + + // not used variables + case material_pt_state::elastic_modulii: + break; + case material_pt_state::shear_modulii: + break; + case material_pt_state::poisson_ratios: + break; + case material_pt_state::heat_flux: + break; + } // end switch + }// end for over mat point state + + + } // end of function + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn concatenate_nodal_fields + /// + /// \brief A function to calculate the average of elem fields + /// + /// + /// \param Node a struct containing the material point state arrays + /// \param elem_scalar_fields the scalar fields + /// \param elem_tensor_fields the tensor fields + /// \param elem_in_mat_elem a listing of the element ids the material resides in + /// \param output_node_states a std::vector of enums specifying the model + /// \param num_mat_elems the number of elements the material resides in + /// \param mat_id the index for the material + /// + ///////////////////////////////////////////////////////////////////////////// + void concatenate_nodal_fields(const node_t& Node, + DCArrayKokkos& node_scalar_fields, + DCArrayKokkos& node_vector_fields, + std::vector& output_node_states, + double dt, + const size_t num_nodes, + const size_t num_dims, + const int node_mass_id, + const int node_vel_id, + const int node_accel_id, + const int node_coord_id, + const int node_grad_level_set_id, + const int node_temp_id) + { + for (auto field : output_node_states){ + switch(field){ + // scalars + case node_state::mass: + + FOR_ALL(node_gid, 0, num_nodes, { + node_scalar_fields(node_mass_id, node_gid) = Node.mass(node_gid); + }); + + break; + case node_state::temp: + FOR_ALL(node_gid, 0, num_nodes, { + node_scalar_fields(node_temp_id, node_gid) = Node.temp(node_gid); + }); + + break; + + // vector fields + + case node_state::coords: + + FOR_ALL(node_gid, 0, num_nodes, { + + node_vector_fields(node_coord_id, node_gid, 0) = Node.coords(node_gid, 0); 
+ node_vector_fields(node_coord_id, node_gid, 1) = Node.coords(node_gid, 1); + if (num_dims == 2) { + node_vector_fields(node_coord_id, node_gid, 2) = 0.0; + } + else{ + node_vector_fields(node_coord_id, node_coord_id, 2) = Node.coords(node_gid, 2); + } // end if + + }); // end parallel for + + break; + case node_state::velocity: + + FOR_ALL(node_gid, 0, num_nodes, { + + // velocity, var is node_vel_id + node_vector_fields(node_vel_id, node_gid, 0) = Node.vel(node_gid, 0); + node_vector_fields(node_vel_id, node_gid, 1) = Node.vel(node_gid, 1); + if (num_dims == 2) { + node_vector_fields(node_vel_id, node_gid, 2) = 0.0; + } + else{ + node_vector_fields(node_vel_id, node_gid, 2) = Node.vel(node_gid, 2); + } // end if + + // accellerate, var is node_accel_id + node_vector_fields(node_accel_id, node_gid, 0) = (Node.vel(node_gid, 0) - Node.vel_n0(node_gid, 0))/dt; + node_vector_fields(node_accel_id, node_gid, 1) = (Node.vel(node_gid, 1) - Node.vel_n0(node_gid, 1))/dt; + if (num_dims == 2) { + node_vector_fields(node_accel_id, node_gid, 2) = 0.0; + } + else{ + node_vector_fields(node_accel_id, node_gid, 2) = (Node.vel(node_gid, 2) - Node.vel_n0(node_gid, 2))/dt; + } // end if + + }); // end parallel for + + break; + + + case node_state::gradient_level_set: + + FOR_ALL(node_gid, 0, num_nodes, { + + // velocity, var is node_vel_id + node_vector_fields(node_grad_level_set_id, node_gid, 0) = Node.gradient_level_set(node_gid, 0); + node_vector_fields(node_grad_level_set_id, node_gid, 1) = Node.gradient_level_set(node_gid, 1); + if (num_dims == 2) { + node_vector_fields(node_grad_level_set_id, node_gid, 2) = 0.0; + } + else{ + node_vector_fields(node_grad_level_set_id, node_gid, 2) = Node.gradient_level_set(node_gid, 2); + } // end if + + }); // end parallel for + + break; + + // -- not used vars + case node_state::force: + break; + + // heat transer vars + case node_state::heat_transfer: + break; + // tensors + } // end switch + } // end for over + + + + } // end function + + 
///////////////////////////////////////////////////////////////////////////// + /// + /// \fn write_vtu + /// + /// \brief Writes a vtu ASCII output file + /// + /// \param Simulation mesh + /// \param State data + /// \param Simulation parameters + /// \param current time value + /// \param Vector of all graphics output times + /// + ///////////////////////////////////////////////////////////////////////////// + void write_vtu( + const ViewCArray& node_coords_host, + const ViewCArray& nodes_in_elem_host, + const DCArrayKokkos& elem_scalar_fields, + const DCArrayKokkos& elem_tensor_fields, + const DCArrayKokkos& node_scalar_fields, + const DCArrayKokkos& node_vector_fields, + const std::vector& elem_scalar_var_names, + const std::vector& elem_tensor_var_names, + const std::vector& node_scalar_var_names, + const std::vector& node_vector_var_names, + const std::string partname, + const int graphics_id, + const size_t num_nodes, + const size_t num_elems, + const size_t num_nodes_in_elem, + const int Pn_order, + const size_t num_dims, + const size_t solver_id + ) + { + FILE* out[20]; // the output files that are written to + char filename[100]; // char string + int max_len = sizeof filename; + int str_output_len; + + const size_t num_elem_scalar_vars = elem_scalar_var_names.size(); + const size_t num_elem_tensor_vars = elem_tensor_var_names.size(); + + const size_t num_node_scalar_vars = node_scalar_var_names.size(); + const size_t num_node_vector_vars = node_vector_var_names.size(); + + + // create filename + str_output_len = snprintf(filename, max_len, "vtk/data/Fierro.solver%zu.%s.%05d.vtu", + solver_id, partname.c_str(), graphics_id); + + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "\n"); + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + fprintf(out[0], " \n", num_nodes, num_elems); + + /* + 
--------------------------------------------------------------------------- + Write the mesh points + --------------------------------------------------------------------------- + */ + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + fprintf(out[0], " \n"); + fprintf(out[0], " \n"); + + // write all components of the point coordinates + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + double coord_z = 0.0; + if(num_dims==3){ + coord_z = node_coords_host(node_gid, 2); + } + fprintf(out[0], + " %f %f %f\n", + node_coords_host(node_gid, 0), + node_coords_host(node_gid, 1), + coord_z); + } // end for + fprintf(out[0], " \n"); + fprintf(out[0], " \n"); + + /* + --------------------------------------------------------------------------- + Write the elems + --------------------------------------------------------------------------- + */ + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + fprintf(out[0], " \n"); + fprintf(out[0], " \n"); + + // WARNING: look into high-order Pn 2D elements with paraview + int Pn_order_z = 0; + if (num_dims == 3){ + Pn_order_z = Pn_order; + } + int order[3] = {Pn_order, Pn_order, Pn_order_z}; + + // const int num_1D_points = Pn_order+1; + + // write all global point numbers for this elem + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(out[0], " "); // adding indentation before printing nodes in element + if (num_dims==3 && Pn_order>1){ + for (int k = 0; k <= Pn_order_z; k++) { + for (int j = 0; j <= Pn_order; j++) { + for (int i = 0; i <= Pn_order; i++) { + size_t node_lid = PointIndexFromIJK(i, j, k, order); + fprintf(out[0], "%lu ", nodes_in_elem_host(elem_gid, node_lid)); + } + } + } // end for + } + else if (num_dims == 3 && Pn_order == 1){ + // 3D linear hexahedral elements + for (int node_lid = 0; node_lid < 8; node_lid++) { + fprintf(out[0], "%lu ", nodes_in_elem_host(elem_gid, node_lid)); + } // end for + } + else if (num_dims == 2){ + // 2D linear is the only supported option + for (int 
node_lid = 0; node_lid < 4; node_lid++) { + fprintf(out[0], "%lu ", nodes_in_elem_host(elem_gid, node_lid)); + } // end for + } + else { + std::cout << "ERROR: outputs failed, dimensions and element types are not compatible \n"; + } // end if + fprintf(out[0], "\n"); + } // end for + fprintf(out[0], " \n"); + + // Write the element offsets + fprintf(out[0], " \n"); + size_t count=0; + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + count += num_nodes_in_elem; + fprintf(out[0], " %lu\n", count); // num points in this elem + all others before it + } // end for + fprintf(out[0], " \n"); + + + // Write the element types + fprintf(out[0], " \n"); + // ---- + // linear element types + // VTK_PIXEL = 8, linear 2D quad with i,j,k indexing (future format for 2D solver) + // VTK_Quad = 9, linear 2D quad with ensight index ordering (current 2D rz convention) + // VTK_VOXEL = 11, linear 3D hex with i,j,k indexing (current format) + // arbitrary order types + // VTK_LAGRANGE_QUADRILATERAL = 70, use this type when a 2D high-order scheme exists + // VTK_LAGRANGE_HEXAHEDRON: 72, this is the current 3D high-order + // VTK_HIGHER_ORDER_HEXAHEDRON: 67 + // VTK_BIQUADRATIC_QUADRATIC_HEXAHEDRON = 33 + // element types: https://vtk.org/doc/nightly/html/vtkCellType_8h_source.html + // element types: https://kitware.github.io/vtk-js/api/Common_DataModel_CellTypes.html + // vtk format: https://www.kitware.com//modeling-arbitrary-order-lagrange-finite-elements-in-the-visualization-toolkit/ + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + if (num_dims==3 && Pn_order>1){ + fprintf(out[0], " %d \n", 72); + } + else if (num_dims == 3 && Pn_order == 1){ + // 3D linear hex + fprintf(out[0], " %d \n", 11); + } + else { + // 2D ensight mesh ordering + fprintf(out[0], " %d \n", 9); + } + } + fprintf(out[0], " \n"); + fprintf(out[0], " \n"); + + + /* + --------------------------------------------------------------------------- + Write the nodal variables to file + 
--------------------------------------------------------------------------- + */ + // vtk vector vars = (position, velocity) + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + if(num_node_vector_vars >0 || num_node_scalar_vars>0){ + + fprintf(out[0], " \n"); + + // node vectors + for (int a_var = 0; a_var < num_node_vector_vars; a_var++) { + fprintf(out[0], " \n", node_vector_var_names[a_var].c_str()); + + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], " %f %f %f\n", + node_vector_fields.host(a_var, node_gid, 0), + node_vector_fields.host(a_var, node_gid, 1), + node_vector_fields.host(a_var, node_gid, 2)); + } // end for nodes + fprintf(out[0], " \n"); + + } // end for vec_vars + + + // node scalar vars + for (int a_var = 0; a_var < num_node_scalar_vars; a_var++) { + fprintf(out[0], " \n", node_scalar_var_names[a_var].c_str()); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(out[0], " %f\n", node_scalar_fields.host(a_var, node_gid)); + } // end for nodes + fprintf(out[0], " \n"); + } // end for vec_vars + + fprintf(out[0], " \n"); + + } // end if + + /* + --------------------------------------------------------------------------- + Write the elem variables to file + --------------------------------------------------------------------------- + */ + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + if(num_elem_scalar_vars >0 || num_elem_tensor_vars>0){ + + fprintf(out[0], " \n"); + + for (int a_var = 0; a_var < num_elem_scalar_vars; a_var++) { + + fprintf(out[0], " \n", elem_scalar_var_names[a_var].c_str()); // the 1 is number of scalar components [1:4] + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(out[0], " %f\n", elem_scalar_fields.host(a_var, elem_gid)); + } // end for elem + fprintf(out[0], " \n"); + } // end for elem scalar_vars + + + // tensors + for (int a_var = 0; a_var < num_elem_tensor_vars; a_var++) { + fprintf(out[0], " \n", elem_tensor_var_names[a_var].c_str()); 
// the 1 is number of scalar components [1:4] + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + // note: paraview is row-major, CArray convention + // Txx Txy Txz Tyx Tyy Tyz Tzx Tzy Tzz + for (size_t i=0; i<3; i++){ + for(size_t j=0; j<3; j++){ + fprintf(out[0], " %f ", elem_tensor_fields.host(a_var, elem_gid, i, j)); + } // end j + } // end i + } // end for elem + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + } // end for elem scalar_vars + + fprintf(out[0], " \n"); + } // end if + + // end of the vtu file + fprintf(out[0], " \n"); + fprintf(out[0], " \n"); + fprintf(out[0], "\n"); + + //----------------- + // close the vtu file for element fields + //----------------- + fclose(out[0]); + + } // end write vtu + + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn write_pvd + /// + /// \brief Writes a pvd ASCII output file for the element and nodal fields + /// + /// \param Vector of all graphics output times + /// \param element average field names + /// \param current time value + /// \param graphics index + /// + ///////////////////////////////////////////////////////////////////////////// + void write_pvd(CArray& graphics_times, + double time_value, + int graphics_id, + const size_t solver_id){ + + FILE* out[20]; // the output files that are written to + char filename[100]; // char string + int max_len = sizeof filename; + int str_output_len; + + // Write time series metadata + str_output_len = snprintf(filename, max_len, "vtk/Fierro.solver%zu.pvd", solver_id); + + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "\n"); + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + + for (int i = 0; i <= graphics_id; i++) { + fprintf(out[0], " \n", + graphics_times(i), solver_id, i, graphics_times(i) ); + //fprintf(out[0], " \n", + // i, solver_id, i, graphics_times(i) ); + } + + 
fprintf(out[0], " \n"); + fprintf(out[0], ""); + + fclose(out[0]); + + } // end pvd + + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn write_vtm + /// + /// \brief Writes a vtm ASCII output file for all fields -- mesh and material + /// + /// \param Vector of all graphics output times + /// \param element average field names + /// \param current time value + /// \param graphics index + /// + ///////////////////////////////////////////////////////////////////////////// + void write_vtm(CArray& graphics_times, + const std::string& elem_part_name, + const std::string& mat_part_name, + double time_value, + int graphics_id, + int num_mats, + bool write_mesh_state, + bool write_mat_pt_state, + const size_t solver_id) + { + // loop over all the files that were written + for(int file_id=0; file_id<=graphics_id; file_id++){ + + FILE* out[20]; // the output files that are written to + char filename[100]; // char string + int max_len = sizeof filename; + int str_output_len; + + + // Write time series metadata to the data file + str_output_len = snprintf(filename, max_len, "vtk/data/Fierro.solver%zu.%05d.vtm", solver_id, file_id); + + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "\n"); + fprintf(out[0], "\n"); + fprintf(out[0], " \n"); + + + // Average mesh fields -- node and elem state written + size_t block_id = 0; // this will need to be incremented based on the number of mesh fields written + if (write_mesh_state){ + fprintf(out[0], " \n", block_id); + { + block_id++; // increment block id for material outputs that follow the element avg block + + // elem and nodal fields are in this file + fprintf(out[0], " \n"); + fprintf(out[0], " \n", + file_id, solver_id, elem_part_name.c_str(), file_id, graphics_times(file_id) ); + fprintf(out[0], " \n"); + + // add other Mesh average output Pieces here + } + 
fprintf(out[0], " \n"); + } // end if write elem and node state is true + + // note: the block_id was incremented if an element average field output was made + if (write_mat_pt_state){ + fprintf(out[0], " \n", block_id); + for (size_t mat_id=0; mat_id\n", mat_id, mat_id); + fprintf(out[0], " \n", + file_id, solver_id, mat_part_name.c_str(), mat_id, file_id, graphics_times(file_id) ); + fprintf(out[0], " \n"); + + } // end for loop mat_id + fprintf(out[0], " \n"); + } // end if write mat satte is true + + // done writing the files to be read by the vtm file + fprintf(out[0], " \n"); + fprintf(out[0], ""); + + fclose(out[0]); + + } // end for file_id + + } // end vtm + + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn build_material_elem_node_lists + /// + /// \brief Creates elems and nodes for a unique mesh of a material (i.e, a part) + /// + /// \param Simulation mesh + /// \param State node data + /// \param Material node coordinates + /// \param Material nodes in the material element + /// \param Material to mesh map for elements + /// \param number of material nodes + /// \param number of material elements + /// \param number of nodes in the element + /// \param number of dimensions + /// + ///////////////////////////////////////////////////////////////////////////// + void build_material_elem_node_lists( + const Mesh_t& mesh, + const DCArrayKokkos& state_node_coords, + DCArrayKokkos& mat_node_coords, + DCArrayKokkos & mat_nodes_in_mat_elem, + const DRaggedRightArrayKokkos& elem_in_mat_elem, + const size_t mat_id, + size_t& num_mat_nodes, + const size_t num_mat_elems, + const size_t num_nodes_in_elem, + const size_t num_dims) + { + + // helper arrays + DCArrayKokkos dummy_counter(mesh.num_nodes, "dummy_counter"); + DCArrayKokkos access_mat_node_gids(mesh.num_nodes, "access_mat_node_gids"); + dummy_counter.set_values(0); + + // tag and count the number of nodes in this part + FOR_ALL (mat_elem_sid, 0, num_mat_elems, 
{ + // get elem gid + size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); // WARNING not GPU compatible + + // parallel loop over the nodes in the element + for(size_t node_lid=0; node_lid0 + + } // end for nodes in element + + }); // end parallel for + Kokkos::fence(); + dummy_counter.update_host(); + + // loop opperation is not thread safe, must be run serially + size_t mat_node_gid = 0; + for(size_t node_gid = 0; node_gid0){ + mat_node_coords.host(mat_node_gid, 0) = state_node_coords.host(node_gid, 0); + mat_node_coords.host(mat_node_gid, 1) = state_node_coords.host(node_gid, 1); + if (num_dims == 3){ + mat_node_coords.host(mat_node_gid, 2) = state_node_coords.host(node_gid, 2); + } // end if on dims + + access_mat_node_gids.host(node_gid) = mat_node_gid; // the part node id + + mat_node_gid ++; + + dummy_counter.host(node_gid) = 0; // set counter to zero, it was accounted for + } // end if this node is on the part + + } // end loop over all mesh nodes + mat_node_coords.update_device(); + access_mat_node_gids.update_device(); + dummy_counter.update_device(); + Kokkos::fence(); + + // save the number of nodes defining the material region, i.e., the part + num_mat_nodes = mat_node_gid; + + // save the new node id's + FOR_ALL (mat_elem_sid, 0, num_mat_elems, { + // get elem gid + size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); + + // parallel loop over the nodes in the element + for(size_t node_lid=0; node_lid graphics_times, + std::vector node_states, + std::vector gauss_pt_states, + std::vector material_pt_states) + { + // WARNING WARNING WARNING: + // This currently assumes the gauss and material point IDs are the same as the element ID + // This will need to be updated for high order methods + + // Update host data + // ---- Update host data ---- + size_t num_mats = State.MaterialPoints.num_material_points.size(); + + State.MaterialPoints.den.update_host(); + State.MaterialPoints.pres.update_host(); + State.MaterialPoints.stress.update_host(); + 
State.MaterialPoints.sspd.update_host(); + State.MaterialPoints.sie.update_host(); + State.MaterialPoints.mass.update_host(); + + State.GaussPoints.vol.update_host(); + + State.node.coords.update_host(); + State.node.vel.update_host(); + State.node.mass.update_host(); + + Kokkos::fence(); + + struct stat st; + + if (stat("state", &st) != 0) { + system("mkdir state"); + } + + size_t num_dims = mesh.num_dims; + + // --------------------------------------------------------------------------- + // Setup of file and directory for exporting + // --------------------------------------------------------------------------- + + // output file + FILE* out_elem_state; // element average state + char filename[128]; + + int max_len = sizeof filename; + + snprintf(filename, max_len, "state/mat_pt_state_t_%6.4e.txt", time_value); + + // output files + out_elem_state = fopen(filename, "w"); + + // write state dump + fprintf(out_elem_state, "# state dump file\n"); + fprintf(out_elem_state, "# x y z radius_2D radius_3D den pres sie sspd vol mass \n"); + + // write out values for the elem + for (size_t mat_id = 0; mat_id < num_mats; mat_id++) { + + size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); + + for (size_t mat_elem_sid = 0; mat_elem_sid < num_mat_elems; mat_elem_sid++) + { + + const size_t elem_gid = State.MaterialToMeshMaps.elem_in_mat_elem.host(mat_id, mat_elem_sid); + + double elem_coords[3]; + elem_coords[0] = 0.0; + elem_coords[1] = 0.0; + elem_coords[2] = 0.0; + + // get the coordinates of the element center + for (size_t node_lid = 0; node_lid < mesh.num_nodes_in_elem; node_lid++) { + + elem_coords[0] += State.node.coords.host(mesh.nodes_in_elem.host(elem_gid, node_lid), 0); + elem_coords[1] += State.node.coords.host(mesh.nodes_in_elem.host(elem_gid, node_lid), 1); + if (num_dims == 3) { + elem_coords[2] += State.node.coords.host(mesh.nodes_in_elem.host(elem_gid, node_lid), 2); + } + else{ + elem_coords[2] = 0.0; + } + } // end loop over nodes 
in element + + elem_coords[0] = elem_coords[0] / ((double)mesh.num_nodes_in_elem); + elem_coords[1] = elem_coords[1] / ((double)mesh.num_nodes_in_elem); + elem_coords[2] = elem_coords[2] / ((double)mesh.num_nodes_in_elem); + + double rad2 = sqrt(elem_coords[0] * elem_coords[0] + + elem_coords[1] * elem_coords[1]); + + double rad3 = sqrt(elem_coords[0] * elem_coords[0] + + elem_coords[1] * elem_coords[1] + + elem_coords[2] * elem_coords[2]); + + + fprintf(out_elem_state, "%4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t \n", + elem_coords[0], + elem_coords[1], + elem_coords[2], + rad2, + rad3, + State.MaterialPoints.den.host(mat_id, mat_elem_sid), + State.MaterialPoints.pres.host(mat_id, mat_elem_sid), + State.MaterialPoints.sie.host(mat_id, mat_elem_sid), + State.MaterialPoints.sspd.host(mat_id, mat_elem_sid), + State.GaussPoints.vol.host(elem_gid), + State.MaterialPoints.mass.host(mat_id, mat_elem_sid) ); + + } // end for elements + + } // end for materials + fclose(out_elem_state); + + + + // printing nodal state + + FILE* out_point_state; // element average state + + snprintf(filename, max_len, "state/node_state_t_%6.4e.txt", time_value); + + // output files + out_point_state = fopen(filename, "w"); + + // write state dump + fprintf(out_point_state, "# state node dump file\n"); + fprintf(out_point_state, "# x y z radius_2D radius_3D vel_x vel_y vel_z speed ||err_v_dot_r|| \n"); + + // get the coordinates of the node + for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + + double node_coords[3]; + + node_coords[0] = State.node.coords.host(node_gid, 0); + node_coords[1] = State.node.coords.host(node_gid, 1); + if (num_dims == 3) { + node_coords[2] = State.node.coords.host(node_gid, 2); + } + else{ + node_coords[2] = 0.0; + } + + double rad2 = sqrt(node_coords[0] * node_coords[0] + + node_coords[1] * node_coords[1]); + double rad3 = sqrt(node_coords[0] * node_coords[0] + + node_coords[1] * 
node_coords[1] + + node_coords[2] * node_coords[2]); + + double node_vel[3]; + + node_vel[0] = State.node.vel.host(node_gid, 0); + node_vel[1] = State.node.vel.host(node_gid, 1); + if (num_dims == 3) { + node_vel[2] = State.node.vel.host(node_gid, 2); + } + else{ + node_vel[2] = 0.0; + } + + double speed = sqrt(node_vel[0] * node_vel[0] + + node_vel[1] * node_vel[1] + + node_vel[2] * node_vel[2]); + + + + // looking at perfect radial motion + double unit_r_vec[2]; + unit_r_vec[0] = node_coords[0]/rad2; + unit_r_vec[1] = node_coords[1]/rad2; + + //the radial motion + double v_dot_r = node_vel[0] * unit_r_vec[0] + + node_vel[1] * unit_r_vec[1]; + + + double err_v_dot_r[3]; + err_v_dot_r[0] = node_vel[0]-unit_r_vec[0]*v_dot_r; + err_v_dot_r[1] = node_vel[1]-unit_r_vec[1]*v_dot_r; + + double mag_err_v_dot_r = sqrt(err_v_dot_r[0]*err_v_dot_r[0] + err_v_dot_r[1]*err_v_dot_r[1]); + + fprintf(out_point_state, "%4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t \n", + node_coords[0], + node_coords[1], + node_coords[2], + rad2, + rad3, + node_vel[0], + node_vel[1], + node_vel[2], + speed, + mag_err_v_dot_r); + + + } // end loop over nodes in element + + + fclose(out_point_state); + + + return; + } // end of state output +}; // end class + +#endif // end Header Guard \ No newline at end of file From c2e3ce57b06da6752b3080b95ddf1bffdfe3dd46 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Wed, 22 Oct 2025 15:12:30 -0500 Subject: [PATCH 02/52] ENH: Adding mesh decomposition example WIP --- .gitignore | 3 +- examples/CMakeLists.txt | 143 +- examples/mesh_decomp/CMakeLists.txt | 23 +- examples/mesh_decomp/install_ptscotch.sh | 6 +- examples/mesh_decomp/mesh.h | 1 - examples/mesh_decomp/mesh_decomp.cpp | 57 +- examples/mesh_decomp/mesh_inputs.h | 141 + examples/mesh_decomp/mesh_io.h | 4898 +--------------------- examples/mesh_decomp/state.h | 139 + scripts/build-matar.sh | 2 +- 10 files changed, 514 insertions(+), 4899 deletions(-) create 
mode 100644 examples/mesh_decomp/mesh_inputs.h create mode 100644 examples/mesh_decomp/state.h diff --git a/.gitignore b/.gitignore index fbdfa9d3..87400105 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ heffte/ docs_doxygen/ docs_sphinx/ tutorial/getting_started/Example0/build_* -tutorial/getting_started/Example0/install* \ No newline at end of file +tutorial/getting_started/Example0/install* +examples/mesh_decomp/lib/* \ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index affcd031..e32ddb2d 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -88,108 +88,111 @@ if (KOKKOS) add_definitions(-DHAVE_THREADS=1) endif() - add_executable(testsetval test_set_values.cpp) - target_link_libraries(testsetval ${LINKING_LIBRARIES}) + # add_executable(testsetval test_set_values.cpp) + # target_link_libraries(testsetval ${LINKING_LIBRARIES}) - add_executable(mtestkokkos main_kokkos.cpp) - target_link_libraries(mtestkokkos ${LINKING_LIBRARIES}) + # add_executable(mtestkokkos main_kokkos.cpp) + # target_link_libraries(mtestkokkos ${LINKING_LIBRARIES}) - add_executable(drrak_test test_drrak.cpp) - target_link_libraries(drrak_test ${LINKING_LIBRARIES}) + # add_executable(drrak_test test_drrak.cpp) + # target_link_libraries(drrak_test ${LINKING_LIBRARIES}) - add_executable(test_kokkos_for kokkos_for.cpp) - target_link_libraries(test_kokkos_for ${LINKING_LIBRARIES}) + # add_executable(test_kokkos_for kokkos_for.cpp) + # target_link_libraries(test_kokkos_for ${LINKING_LIBRARIES}) - add_executable(test_dual_types test_dual_types.cpp) - target_link_libraries(test_dual_types ${LINKING_LIBRARIES}) + # add_executable(test_dual_types test_dual_types.cpp) + # target_link_libraries(test_dual_types ${LINKING_LIBRARIES}) - add_executable(kokkos_csr CSRKokkos.cpp) - target_link_libraries(kokkos_csr ${LINKING_LIBRARIES}) + # add_executable(kokkos_csr CSRKokkos.cpp) + # target_link_libraries(kokkos_csr ${LINKING_LIBRARIES}) - 
add_executable(kokkos_csc CSCKokkos.cpp) - target_link_libraries(kokkos_csc ${LINKING_LIBRARIES}) + # add_executable(kokkos_csc CSCKokkos.cpp) + # target_link_libraries(kokkos_csc ${LINKING_LIBRARIES}) - add_executable(mtr_kokkos-simple mtr-kokkos-simple.cpp) - target_link_libraries(mtr_kokkos-simple ${LINKING_LIBRARIES}) + # add_executable(mtr_kokkos-simple mtr-kokkos-simple.cpp) + # target_link_libraries(mtr_kokkos-simple ${LINKING_LIBRARIES}) - add_executable(annkokkos ann_kokkos.cpp) - target_link_libraries(annkokkos ${LINKING_LIBRARIES}) + # add_executable(annkokkos ann_kokkos.cpp) + # target_link_libraries(annkokkos ${LINKING_LIBRARIES}) - add_executable(annkokkos_compare ann_kokkos_compare.cpp) - target_link_libraries(annkokkos_compare ${LINKING_LIBRARIES}) + # add_executable(annkokkos_compare ann_kokkos_compare.cpp) + # target_link_libraries(annkokkos_compare ${LINKING_LIBRARIES}) - #add_executable(ompperftest ompperftest.cpp) - #target_link_libraries(ompperftest ${LINKING_LIBRARIES}) + # #add_executable(ompperftest ompperftest.cpp) + # #target_link_libraries(ompperftest ${LINKING_LIBRARIES}) - add_executable(lu_test test_lu_solve.cpp) - target_link_libraries(lu_test ${LINKING_LIBRARIES}) + # add_executable(lu_test test_lu_solve.cpp) + # target_link_libraries(lu_test ${LINKING_LIBRARIES}) - add_executable(qr_test test_qr_solve.cpp) - target_link_libraries(qr_test ${LINKING_LIBRARIES}) + # add_executable(qr_test test_qr_solve.cpp) + # target_link_libraries(qr_test ${LINKING_LIBRARIES}) - if (Matar_ENABLE_TRILINOS) - add_executable(anndistributed ann_distributed.cpp) - target_link_libraries(anndistributed ${LINKING_LIBRARIES}) + # if (Matar_ENABLE_TRILINOS) + # add_executable(anndistributed ann_distributed.cpp) + # target_link_libraries(anndistributed ${LINKING_LIBRARIES}) - add_executable(anndistributed_crs ann_distributed_crs.cpp) - target_link_libraries(anndistributed_crs ${LINKING_LIBRARIES}) + # add_executable(anndistributed_crs ann_distributed_crs.cpp) 
+ # target_link_libraries(anndistributed_crs ${LINKING_LIBRARIES}) - add_executable(test_tpetra_farray test_tpetra_farray.cpp) - target_link_libraries(test_tpetra_farray ${LINKING_LIBRARIES}) + # add_executable(test_tpetra_farray test_tpetra_farray.cpp) + # target_link_libraries(test_tpetra_farray ${LINKING_LIBRARIES}) - add_executable(test_tpetra_carray test_tpetra_carray.cpp) - target_link_libraries(test_tpetra_carray ${LINKING_LIBRARIES}) + # add_executable(test_tpetra_carray test_tpetra_carray.cpp) + # target_link_libraries(test_tpetra_carray ${LINKING_LIBRARIES}) - add_executable(test_tpetra_crs test_tpetra_crs.cpp) - target_link_libraries(test_tpetra_crs ${LINKING_LIBRARIES}) + # add_executable(test_tpetra_crs test_tpetra_crs.cpp) + # target_link_libraries(test_tpetra_crs ${LINKING_LIBRARIES}) - add_executable(test_tpetra_mesh test_tpetra_mesh.cpp) - target_link_libraries(test_tpetra_mesh ${LINKING_LIBRARIES}) - endif() + # add_executable(test_tpetra_mesh test_tpetra_mesh.cpp) + # target_link_libraries(test_tpetra_mesh ${LINKING_LIBRARIES}) + # endif() - if (OPENMP) - add_executable(parallel_hello_world parallel_hello_world.cpp) - target_link_libraries(parallel_hello_world ${LINKING_LIBRARIES}) - endif() + # if (OPENMP) + # add_executable(parallel_hello_world parallel_hello_world.cpp) + # target_link_libraries(parallel_hello_world ${LINKING_LIBRARIES}) + # endif() - if (MPI) - include_directories(laplaceMPI) - add_subdirectory(laplaceMPI) - endif() + # if (MPI) + # include_directories(laplaceMPI) + # add_subdirectory(laplaceMPI) + # endif() endif() -### HIP Linking error, will add back in after fixed -if (NOT HIP) - include_directories(virtualFcnKokkos) - add_subdirectory(virtualFcnKokkos) -endif() +# ### HIP Linking error, will add back in after fixed +# if (NOT HIP) +# include_directories(virtualFcnKokkos) +# add_subdirectory(virtualFcnKokkos) +# endif() + +# # In testing, not working +# #include_directories(gArrayofgArrays) +# 
#add_subdirectory(gArrayofgArrays) -# In testing, not working -#include_directories(gArrayofgArrays) -#add_subdirectory(gArrayofgArrays) +# include_directories(virtualFcnMATAR) +# add_subdirectory(virtualFcnMATAR) -include_directories(virtualFcnMATAR) -add_subdirectory(virtualFcnMATAR) +# include_directories(laplace) +# add_subdirectory(laplace) -include_directories(laplace) -add_subdirectory(laplace) +# include_directories(halfspace_cooling) +# add_subdirectory(halfspace_cooling) -include_directories(halfspace_cooling) -add_subdirectory(halfspace_cooling) +# include_directories(watt-graph) +# add_subdirectory(watt-graph) -include_directories(watt-graph) -add_subdirectory(watt-graph) +# #include_directories(matar_fortran) +# #add_subdirectory(matar_fortran) -#include_directories(matar_fortran) -#add_subdirectory(matar_fortran) +# include_directories(sparsetests) +# add_subdirectory(sparsetests) -include_directories(sparsetests) -add_subdirectory(sparsetests) +# include_directories(test_rocm) +# add_subdirectory(test_rocm) -include_directories(test_rocm) -add_subdirectory(test_rocm) +include_directories(mesh_decomp) +add_subdirectory(mesh_decomp) #include_directories(phaseField/srcKokkosVerbose) #add_subdirectory(phaseField/srcKokkosVerbose) diff --git a/examples/mesh_decomp/CMakeLists.txt b/examples/mesh_decomp/CMakeLists.txt index 721859a8..b002a355 100644 --- a/examples/mesh_decomp/CMakeLists.txt +++ b/examples/mesh_decomp/CMakeLists.txt @@ -1,13 +1,32 @@ cmake_minimum_required(VERSION 3.1.3) +# Find MPI +find_package(MPI REQUIRED) + find_package(Matar REQUIRED) +execute_process( + COMMAND ${CMAKE_CURRENT_LIST_DIR}/install_ptscotch.sh + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + RESULT_VARIABLE INSTALL_PTSCOTCH_RESULT +) + +if(NOT INSTALL_PTSCOTCH_RESULT EQUAL 0) + message(FATAL_ERROR "Failed to install PT-Scotch by running install_ptscotch.sh") +endif() + + if (KOKKOS) #find_package(Kokkos REQUIRED) #new - add_executable(mech_decomp mesh_decomp.cpp) + 
add_executable(mesh_decomp mesh_decomp.cpp) add_definitions(-DHAVE_KOKKOS=1) - target_link_libraries(mesh_decomp ${LINKING_LIBRARIES}) + # Add include directories for MPI and Scotch/PT-Scotch + target_include_directories(mesh_decomp PRIVATE ${MPI_CXX_INCLUDE_PATH} ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/src/include) + + # Link libraries + target_link_libraries(mesh_decomp ${LINKING_LIBRARIES} MPI::MPI_CXX) + target_link_directories(mesh_decomp PRIVATE ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/src/lib) endif() diff --git a/examples/mesh_decomp/install_ptscotch.sh b/examples/mesh_decomp/install_ptscotch.sh index 95ad7914..00d29df9 100755 --- a/examples/mesh_decomp/install_ptscotch.sh +++ b/examples/mesh_decomp/install_ptscotch.sh @@ -11,8 +11,10 @@ LIB_DIR="lib" # echo "Installing Scotch and PT-Scotch to ${INSTALL_PREFIX}" -# Create lib directory -mkdir -p "${LIB_DIR}" +# Create lib directory if it doesn't exist +if [ ! -d "${LIB_DIR}" ]; then + mkdir -p "${LIB_DIR}" +fi cd ${LIB_DIR} # Clone and build Scotch echo "Cloning Scotch..." diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h index 599cb77d..9a7140a3 100644 --- a/examples/mesh_decomp/mesh.h +++ b/examples/mesh_decomp/mesh.h @@ -36,7 +36,6 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "matar.h" #include "state.h" -#include "ref_elem.h" #include #define PI 3.141592653589793 diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index dd26b631..595ab4e0 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -5,28 +5,67 @@ #include #include + +#include "mesh.h" +#include "state.h" +#include "mesh_io.h" + // Include Scotch headers #include "scotch.h" #include "ptscotch.h" -struct initial_mesh_t { - int num_elems; // Number of elements + + +int main(int argc, char** argv) { + + MPI_Init(&argc, &argv); + MATAR_INITIALIZE(argc, argv); + { // MATAR scope + + int world_size; + int rank; + MPI_Comm_size(MPI_COMM_WORLD, &world_size); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + + + + // Create mesh, gauss points, and node data structures on each rank + Mesh_t mesh; + GaussPoint_t GaussPoints; + node_t node; + + + if (rank == 0) { + std::cout<<"Rank "< nodes_in_elem; // Nodes in an element - std::vector elems_in_elem; // Elements in an element + double origin[3] = {0.0, 0.0, 0.0}; + double length[3] = {1.0, 1.0, 1.0}; + int num_elems[3] = {10, 10, 10}; - std::vector verttab; // Start index in edgetab for each element (size num_elems+1) - std::vector edgetab; // Adjacency info: neighboring element indices -}; + std::cout<<"Initializing mesh"< +#include "matar.h" + +namespace mesh_input +{ +// source of the mesh +enum source +{ + none = 0, ///< No source given, should fail + generate = 1, ///< Create the mesh using the mesh builder + file = 2, ///< Read in the mesh from a file +}; + +// type of mesh to generate if source = generate +enum type +{ + Box = 0, // Create the mesh using the mesh builder + Polar = 1, // Create a polar 2D mesh +}; +} // end of namespace + +static std::map mesh_input_source_map +{ + { "generate", mesh_input::generate }, + { "file", mesh_input::file } +}; + +static std::map mesh_input_type_map +{ + { "box", mesh_input::Box }, + { "polar", mesh_input::Polar 
} +}; + +///////////////////////////////////////////////////////////////////////////// +/// +/// \struct mesh_input_t +/// +/// \brief Meshing related input parameters +/// +///////////////////////////////////////////////////////////////////////////// +struct mesh_input_t +{ + int num_dims = 3; ///< Number of dimensions for the mesh + mesh_input::source source = mesh_input::none; ///< Source of mesh, file or generate + std::string file_path = ""; ///< Absolute path of mesh file + mesh_input::type type; ///< Type of mesh to generate if source = generate + + double origin[3] = { 0.0, 0.0, 0.0 }; ///< Mesh origin for generating a mesh + double length[3] = { 0.0, 0.0, 0.0 }; ///< x,y,z length of generated mesh + size_t num_elems[3] = { 1, 1, 1 }; ///< Number of elements along x,y, z for generating a mesh. + + size_t p_order = 1; + + // WARNING, NOT YET PARSED + double inner_radius = 0.0; ///< Inner radius for generating 2D RZ mesh + double outer_radius = 1.0; ///< Outer radius for generating 2D RZ mesh + double starting_angle = 0.0; ///< Starting angle in degrees for 2D RZ mesh + double ending_angle = 90; ///< Ending angle in degrees for 2D RZ mesh + + int num_radial_elems = 10; ///< Number of elements in the radial direction for 2DRZ mesh + int num_angular_elems = 10; ///< Number of elements in the angular direction for 2DRZ mesh + + double scale_x = 1.0; ///< Scales mesh x coordinate dimensions + double scale_y = 1.0; ///< Scales mesh y coordinate dimensions + double scale_z = 1.0; ///< Scales mesh z coordinate dimensions + + DCArrayKokkos object_ids; ///< the object_ids in the vtu full mesh file (from exodus mesh) + +}; // mesh_input_t + +// ---------------------------------- +// valid inputs for mesh options +// ---------------------------------- +static std::vector str_mesh_inps +{ + "num_dims", + "source", + "file_path", + "type", + "origin", + "length", + "num_elems", + "polynomial_order", + "inner_radius", + "outer_radius", + "starting_angle", + "ending_angle", + 
"num_radial_elems", + "num_angular_elems", + "scale_x", + "scale_y", + "scale_z" +}; + +// ---------------------------------- +// required inputs for mesh options +// ---------------------------------- +static std::vector mesh_required_inps +{ + "source", + "num_dims", +}; + +#endif // end Header Guard \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index 03fee676..0c82ba9d 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -1,139 +1,21 @@ -/********************************************************************************************** -© 2020. Triad National Security, LLC. All rights reserved. -This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos -National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. -Department of Energy/National Nuclear Security Administration. All rights in the program are -reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear -Security Administration. The Government is granted for itself and others acting on its behalf a -nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare -derivative works, distribute copies to the public, perform publicly and display publicly, and -to permit others to do so. -This program is open source under the BSD-3 License. -Redistribution and use in source and binary forms, with or without modification, are permitted -provided that the following conditions are met: -1. Redistributions of source code must retain the above copyright notice, this list of -conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright notice, this list of -conditions and the following disclaimer in the documentation and/or other materials -provided with the distribution. -3. 
Neither the name of the copyright holder nor the names of its contributors may be used -to endorse or promote products derived from this software without specific prior -written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR -CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; -OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, -WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR -OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF -ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -**********************************************************************************************/ -#ifndef FIERRO_IO_H -#define FIERRO_IO_H +#ifndef MESH_IO_H +#define MESH_IO_H #include "matar.h" #include "mesh.h" #include "state.h" -#include "simulation_parameters.h" -#include "region.h" -#include "string_utils.h" -#include -#include -#include -#include -#include -#include // for string pattern recoginition -#include -#include -#include -#include +using namespace mtr; -///////////////////////////////////////////////////////////////////////////// -/// -/// \fn get_id -/// -/// \brief This gives the index value of the point or the elem -/// -/// Assumes that the grid has an i,j,k structure -/// the elem = i + (j)*(num_points_i-1) + (k)*(num_points_i-1)*(num_points_j-1) -/// the point = i + (j)*num_points_i + (k)*num_points_i*num_points_j -/// -/// \param i index -/// \param j index -/// \param k index -/// \param Number of i indices -/// \param Number of j indices -/// 
-///////////////////////////////////////////////////////////////////////////// -inline int get_id(int i, int j, int k, int num_i, int num_j) -{ - return i + j * num_i + k * num_i * num_j; -} - -///////////////////////////////////////////////////////////////////////////// -/// -/// \fn PointIndexFromIJK -/// -/// \brief Given (i,j,k) coordinates within the Lagrange hex, return an -/// offset into the local connectivity (PointIds) array. The order parameter -/// must point to an array of 3 integers specifying the order along each -/// axis of the hexahedron. -/// -///////////////////////////////////////////////////////////////////////////// -inline int PointIndexFromIJK(int i, int j, int k, const int* order) -{ - bool ibdy = (i == 0 || i == order[0]); - bool jbdy = (j == 0 || j == order[1]); - bool kbdy = (k == 0 || k == order[2]); - // How many boundaries do we lie on at once? - int nbdy = (ibdy ? 1 : 0) + (jbdy ? 1 : 0) + (kbdy ? 1 : 0); - if (nbdy == 3) { // Vertex DOF - // ijk is a corner node. Return the proper index (somewhere in [0,7]): - return (i ? (j ? 2 : 1) : (j ? 3 : 0)) + (k ? 4 : 0); - } - int offset = 8; - if (nbdy == 2) { // Edge DOF - if (!ibdy) { // On i axis - return (i - 1) + (j ? order[0] - 1 + order[1] - 1 : 0) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; - } - if (!jbdy) { // On j axis - return (j - 1) + (i ? order[0] - 1 : 2 * (order[0] - 1) + order[1] - 1) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; - } - // !kbdy, On k axis - offset += 4 * (order[0] - 1) + 4 * (order[1] - 1); - return (k - 1) + (order[2] - 1) * (i ? (j ? 3 : 1) : (j ? 2 : 0)) + offset; - } - offset += 4 * (order[0] - 1 + order[1] - 1 + order[2] - 1); - if (nbdy == 1) { // Face DOF - if (ibdy) { // On i-normal face - return (j - 1) + ((order[1] - 1) * (k - 1)) + (i ? (order[1] - 1) * (order[2] - 1) : 0) + offset; - } - offset += 2 * (order[1] - 1) * (order[2] - 1); - if (jbdy) { // On j-normal face - return (i - 1) + ((order[0] - 1) * (k - 1)) + (j ? 
(order[2] - 1) * (order[0] - 1) : 0) + offset; - } - offset += 2 * (order[2] - 1) * (order[0] - 1); - // kbdy, On k-normal face - return (i - 1) + ((order[0] - 1) * (j - 1)) + (k ? (order[0] - 1) * (order[1] - 1) : 0) + offset; - } - - // nbdy == 0: Body DOF - offset += 2 * ( (order[1] - 1) * (order[2] - 1) + (order[2] - 1) * (order[0] - 1) + (order[0] - 1) * (order[1] - 1)); - return offset + (i - 1) + (order[0] - 1) * ( (j - 1) + (order[1] - 1) * ( (k - 1))); -} ///////////////////////////////////////////////////////////////////////////// /// -/// \fn get_id_device +/// \fn get_id /// /// \brief This gives the index value of the point or the elem /// @@ -148,4747 +30,137 @@ inline int PointIndexFromIJK(int i, int j, int k, const int* order) /// \param Number of j indices /// ///////////////////////////////////////////////////////////////////////////// -KOKKOS_INLINE_FUNCTION -int get_id_device(int i, int j, int k, int num_i, int num_j) +inline int get_id(int i, int j, int k, int num_i, int num_j) { return i + j * num_i + k * num_i * num_j; } - -//------- -// word is the field name e.g., Offsets, connectivity, etc. 
-// stop is the phrase to stop extracting values -template -inline bool extract_values_xml(T *values_xml, - const std::string& word, - const std::string& stop, - std::ifstream& in, - size_t& size) -{ - - bool found = false; - - std::string line; - - size_t i = 0; - - // Read the file line by line looking for specified word - while (std::getline(in, line)) { - - if (line.find(word) != std::string::npos) { // Check if the portion of the word is in the line - found = true; - } - if(found) { - - // loop over the lines in the file, extracting the values of the field corresponding to the word - while (std::getline(in, line)){ - - std::istringstream iss(line); // Create a stream from the line - - // extract the individual values from the stream - T value; - while (iss >> value) { - values_xml[i] = value; - i++; - } // end while - - if (line.find(stop) != std::string::npos) { // Check if the stop word is in the line - break; - } // end if - - } // end while - - if(found) break; - - } // end if found - - } // end while - - size = i; - - return found; - -} // end function - - -// find the number of points and number of cells in the mesh -inline bool extract_num_points_and_cells_xml(int& numberOfPoints, - int& numberOfCells, - std::ifstream& in) -{ - bool found = false; - - std::string line; - - - // Read the file line by line looking for NumberOfPoints - while (std::getline(in, line)) { - - std::string word = "NumberOfPoints="; // A portion of a word - - if (line.find(word) != std::string::npos) { // Check if the portion of the word is in the line - found = true; - } - if(found) { - // Define regex pattern to match the attributes and capture values - std::regex pattern(R"(NumberOfPoints=\"(\d+)\" NumberOfCells=\"(\d+)\")"); - std::smatch match; - - if (std::regex_search(line, match, pattern)) { - //std::cout << "Number of nodes in mesh file: " << match[1] << std::endl; - //std::cout << "Number of cells in mesh file: " << match[2] << std::endl; - - numberOfPoints = 
std::stoi(match[1].str()); - numberOfCells = std::stoi(match[2].str()); - - } else { - std::cout << "Error reading the number of points and cells in the mesh!" << std::endl; - } - - break; - } // end if - - } // end while - - return found; - -} // end function - - -// 8 = pixal i,j,k linear quad ording -// 9 = linear quad ensight ordering -// 11 = voxel i,j,k linear hex ording -// 12 = linear ensight hex ordering -// 72 = VTK_LAGRANGE_HEXAHEDRON -namespace element_types -{ - enum element_name - { - linear_quad_ijk = 8, - linear_quad = 9, - linear_hex_ijk = 11, - linear_hex = 12, - arbitrary_hex = 72 - }; -} - ///////////////////////////////////////////////////////////////////////////// /// -/// \class MeshReader +/// \fn build_3d_box /// -/// \brief Class for simplifying reading meshes +/// \brief Builds an unstructured 3D rectilinear mesh /// -/// This class contains the requisite functions required to read different -/// mesh formats. The idea is to set the mesh file name, and parse the -/// extension to decide which reader to use. Currently, only ensight .geo -/// files are supported. 
+/// \param Simulation mesh that is built +/// \param Element state data +/// \param Node state data +/// \param origin The origin of the mesh +/// \param length The length of the mesh +/// \param num_elems The number of elements in the mesh /// ///////////////////////////////////////////////////////////////////////////// -class MeshReader +void build_3d_box( + Mesh_t& mesh, + GaussPoint_t& GaussPoints, + node_t& node, + double origin[3], + double length[3], + int num_elems_dim[3]) { -private: - // Handy structs for parsing input meshes - struct Node { - int id; - double x, y, z; - }; - - struct Element { - int id; - std::vector connectivity; - }; - -public: - - char* mesh_file_ = NULL; - - MeshReader() {} // Simulation_Parameters& _simparam); - - ~MeshReader() = default; - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn set_mesh_file - /// - /// \brief Sets the mesh file path for reading in a mesh - /// - /// \param Path to mesh file - /// - ///////////////////////////////////////////////////////////////////////////// - void set_mesh_file(char* MESH) - { - mesh_file_ = MESH; - } - - // Reads and initializes the mesh and geometric state entities - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn read_mesh - /// - /// \brief Read mesh from file - /// - /// \param Simulation mesh - /// \param Simulation state - /// \param Number of dimensions - /// - /// - ///////////////////////////////////////////////////////////////////////////// - void read_mesh(Mesh_t& mesh, - State_t& State, - mesh_input_t& mesh_inps, - int num_dims) - { - if (mesh_file_ == NULL) { - throw std::runtime_error("**** No mesh path given for read_mesh ****"); - } + printf("Creating a 3D box mesh \n"); - std::ifstream file(mesh_file_); - if (file.is_open()) { - std::cout << "The file exists." 
<< std::endl; - file.close(); - } else { - throw std::runtime_error("**** Mesh path given does not exists ****"); - } + const int num_dim = 3; - // Check mesh file extension - // and read based on extension - std::string filePathStr(mesh_file_); - std::string extension; + // Note: In fierro, these come from the simulation parameters + const double lx = length[0]; + const double ly = length[1]; + const double lz = length[2]; - size_t pos = filePathStr.rfind('.'); - if (pos != std::string::npos) { - extension = filePathStr.substr(pos + 1); - } else { - extension = ""; - } + // Note: In fierro, these come from the simulation parameters + const int num_elems_i = num_elems_dim[0]; + const int num_elems_j = num_elems_dim[1]; + const int num_elems_k = num_elems_dim[2]; - std::cout << "File extension is: " << extension << std::endl; + const int num_points_i = num_elems_i + 1; // num points in x + const int num_points_j = num_elems_j + 1; // num points in y + const int num_points_k = num_elems_k + 1; // num points in z - if(extension == "geo"){ // Ensight meshfile extension - read_ensight_mesh(mesh, State.GaussPoints, State.node, State.corner, mesh_inps, num_dims); - } - else if(extension == "inp"){ // Abaqus meshfile extension - read_Abaqus_mesh(mesh, State, num_dims); - } - else if(extension == "vtk"){ // vtk file format - read_vtk_mesh(mesh, State.GaussPoints, State.node, State.corner, mesh_inps, num_dims); - } - else if(extension == "vtu"){ // vtu file format - read_vtu_mesh(mesh, State.GaussPoints, State.node, State.corner, mesh_inps, num_dims); - } - else{ - throw std::runtime_error("**** Mesh file extension not understood ****"); - } + const int num_nodes = num_points_i * num_points_j * num_points_k; - } + const double dx = lx / ((double)num_elems_i); // len/(num_elems_i) + const double dy = ly / ((double)num_elems_j); // len/(num_elems_j) + const double dz = lz / ((double)num_elems_k); // len/(num_elems_k) - 
///////////////////////////////////////////////////////////////////////////// - /// - /// \fn read_ensight_mesh - /// - /// \brief Read .geo mesh file - /// - /// \param Simulation mesh - /// \param Element state struct - /// \param Node state struct - /// \param Corner state struct - /// \param Number of dimensions - /// - ///////////////////////////////////////////////////////////////////////////// - void read_ensight_mesh(Mesh_t& mesh, - GaussPoint_t& GaussPoints, - node_t& node, - corner_t& corner, - mesh_input_t& mesh_inps, - int num_dims) - { - FILE* in; - char ch; + const int num_elems = num_elems_i * num_elems_j * num_elems_k; - size_t num_nodes_in_elem = 1; - for (int dim = 0; dim < num_dims; dim++) { - num_nodes_in_elem *= 2; - } + // --- 3D parameters --- + // const int num_faces_in_elem = 6; // number of faces in elem + // const int num_points_in_elem = 8; // number of points in elem + // const int num_points_in_face = 4; // number of points in a face + // const int num_edges_in_elem = 12; // number of edges in a elem - // read the mesh WARNING: assumes a .geo file - in = fopen(mesh_file_, "r"); - - // skip 8 lines - for (int j = 1; j <= 8; j++) { - int i = 0; - while ((ch = (char)fgetc(in)) != '\n') { - i++; - } - } - - // --- Read in the nodes in the mesh --- - - size_t num_nodes = 0; - - fscanf(in, "%lu", &num_nodes); - printf("Number of nodes read in %lu\n", num_nodes); - - - mesh.initialize_nodes(num_nodes); + // initialize mesh node variables + mesh.initialize_nodes(num_nodes); // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers - std::vector required_node_state = { node_state::coords }; - node.initialize(num_nodes, num_dims, required_node_state); - - // read the initial mesh coordinates - // x-coords - for (int node_id = 0; node_id < mesh.num_nodes; node_id++) { - fscanf(in, "%le", &node.coords.host(node_id, 0)); - node.coords.host(node_id, 0)*= mesh_inps.scale_x; - } - - // 
y-coords - for (int node_id = 0; node_id < mesh.num_nodes; node_id++) { - fscanf(in, "%le", &node.coords.host(node_id, 1)); - node.coords.host(node_id, 1)*= mesh_inps.scale_y; - } - - // z-coords - for (int node_id = 0; node_id < mesh.num_nodes; node_id++) { - if (num_dims == 3) { - fscanf(in, "%le", &node.coords.host(node_id, 2)); - node.coords.host(node_id, 2)*= mesh_inps.scale_z; - } - else{ - double dummy; - fscanf(in, "%le", &dummy); - } - } // end for - - - // Update device nodal positions - node.coords.update_device(); - - ch = (char)fgetc(in); - - // skip 1 line - for (int j = 1; j <= 1; j++) { - int i = 0; - while ((ch = (char)fgetc(in)) != '\n') { - i++; - } - } - - // --- read in the elements in the mesh --- - size_t num_elem = 0; - - fscanf(in, "%lu", &num_elem); - printf("Number of elements read in %lu\n", num_elem); - - // initialize elem variables - mesh.initialize_elems(num_elem, num_dims); - // GaussPoints.initialize(num_elem, 3); // always 3D here, even for 2D - - - // for each cell read the list of associated nodes - for (int elem_gid = 0; elem_gid < num_elem; elem_gid++) { - for (int node_lid = 0; node_lid < num_nodes_in_elem; node_lid++) { - fscanf(in, "%lu", &mesh.nodes_in_elem.host(elem_gid, node_lid)); // %d vs zu - - // shift to start node index space at 0 - mesh.nodes_in_elem.host(elem_gid, node_lid) -= 1; - } - } - - // Convert from ensight to IJK mesh - int convert_ensight_to_ijk[8]; - convert_ensight_to_ijk[0] = 0; - convert_ensight_to_ijk[1] = 1; - convert_ensight_to_ijk[2] = 3; - convert_ensight_to_ijk[3] = 2; - convert_ensight_to_ijk[4] = 4; - convert_ensight_to_ijk[5] = 5; - convert_ensight_to_ijk[6] = 7; - convert_ensight_to_ijk[7] = 6; - - int tmp_ijk_indx[8]; - - for (int elem_gid = 0; elem_gid < num_elem; elem_gid++) { - for (int node_lid = 0; node_lid < num_nodes_in_elem; node_lid++) { - tmp_ijk_indx[node_lid] = mesh.nodes_in_elem.host(elem_gid, convert_ensight_to_ijk[node_lid]); - } - - for (int node_lid = 0; node_lid < 
num_nodes_in_elem; node_lid++){ - mesh.nodes_in_elem.host(elem_gid, node_lid) = tmp_ijk_indx[node_lid]; - } - } - // update device side - mesh.nodes_in_elem.update_device(); - - // initialize corner variables - int num_corners = num_elem * mesh.num_nodes_in_elem; - mesh.initialize_corners(num_corners); - // corner.initialize(num_corners, num_dims); - - // Close mesh input file - fclose(in); - - // Build connectivity - mesh.build_connectivity(); - - return; - } // end read ensight mesh - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn read_Abaqus_mesh - /// - /// \brief Read .inp mesh file - /// - /// \param Simulation mesh - /// \param Simulation state - /// \param Node state struct - /// \param Number of dimensions - /// - ///////////////////////////////////////////////////////////////////////////// - void read_Abaqus_mesh(Mesh_t& mesh, - State_t& State, - int num_dims) - { - - std::cout<<"Reading abaqus input file for mesh"< nodes; - std::vector elements; - - std::string line; - bool readingNodes = false; - bool readingElements = false; - - while (std::getline(inputFile, line)) { - if (line.find("*Node") != std::string::npos) { - readingNodes = true; - std::cout<<"Found *Node"<> node.id && std::getline(iss, token, ',') && iss >> node.x && - std::getline(iss, token, ',') && iss >> node.y && - std::getline(iss, token, ',') && iss >> node.z)) { - std::cerr << "Failed to parse line: " << line << std::endl; - continue; // Skip this line if parsing failed - } - nodes.push_back(node); - } - - if (line.find("*Element") != std::string::npos) { - readingElements = true; - std::cout<<"Found *Element*"<> element.id)){ - std::cout << "Failed to parse line: " << line << std::endl; - continue; // Skip this line if parsing failed - } - - while ((std::getline(iss, token, ','))) { - // Now extract the integer, ignoring any trailing whitespace - int val; - iss >> val; - element.connectivity.push_back(val); - } - - // Convert from 
abaqus to IJK mesh - int convert_abq_to_ijk[8]; - convert_abq_to_ijk[0] = 0; - convert_abq_to_ijk[1] = 1; - convert_abq_to_ijk[2] = 3; - convert_abq_to_ijk[3] = 2; - convert_abq_to_ijk[4] = 4; - convert_abq_to_ijk[5] = 5; - convert_abq_to_ijk[6] = 7; - convert_abq_to_ijk[7] = 6; - - int tmp_ijk_indx[8]; - - for (int node_lid = 0; node_lid < 8; node_lid++) { - tmp_ijk_indx[node_lid] = element.connectivity[convert_abq_to_ijk[node_lid]]; - } - - for (int node_lid = 0; node_lid < 8; node_lid++){ - element.connectivity[node_lid] = tmp_ijk_indx[node_lid]; - } - - elements.push_back(element); - } - } - - inputFile.close(); - - size_t num_nodes = nodes.size(); - - printf("Number of nodes read in %lu\n", num_nodes); - - // initialize node variables - mesh.initialize_nodes(num_nodes); - - // initialize node state, for now, we just need coordinates, the rest will be initialize by the respective solvers - std::vector required_node_state = { node_state::coords }; - - State.node.initialize(num_nodes, num_dims, required_node_state); - - - // Copy nodes to mesh - for(int node_gid = 0; node_gid < num_nodes; node_gid++){ - State.node.coords.host(node_gid, 0) = nodes[node_gid].x; - State.node.coords.host(node_gid, 1) = nodes[node_gid].y; - State.node.coords.host(node_gid, 2) = nodes[node_gid].z; - } - - // Update device nodal positions - State.node.coords.update_device(); - - - // --- read in the elements in the mesh --- - size_t num_elem = elements.size(); - printf("Number of elements read in %lu\n", num_elem); - - // initialize elem variables - mesh.initialize_elems(num_elem, num_dims); - - - // for each cell read the list of associated nodes - for (int elem_gid = 0; elem_gid < num_elem; elem_gid++) { - for (int node_lid = 0; node_lid < 8; node_lid++) { - mesh.nodes_in_elem.host(elem_gid, node_lid) = elements[elem_gid].connectivity[node_lid]; - - // shift to start node index space at 0 - mesh.nodes_in_elem.host(elem_gid, node_lid) -= 1; - } - } + std::vector required_node_state = { 
node_state::coords }; + node.initialize(num_nodes, num_dim, required_node_state); - // update device side - mesh.nodes_in_elem.update_device(); + // --- Build nodes --- - // initialize corner variables - int num_corners = num_elem * mesh.num_nodes_in_elem; - mesh.initialize_corners(num_corners); - // State.corner.initialize(num_corners, num_dims); - - // Build connectivity - mesh.build_connectivity(); - } // end read abaqus mesh - - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn read_vtk_mesh - /// - /// \brief Read ASCII .vtk mesh file - /// - /// \param Simulation mesh - /// \param Simulation state - /// \param Node state struct - /// \param Number of dimensions - /// - ///////////////////////////////////////////////////////////////////////////// - void read_vtk_mesh(Mesh_t& mesh, - GaussPoint_t& GaussPoints, - node_t& node, - corner_t& corner, - mesh_input_t& mesh_inps, - int num_dims) - { - - std::cout<<"Reading VTK mesh"< v = split (str, delimiter); - - // looking for the following text: - // POINTS %d float - if(v[0] == "POINTS"){ - size_t num_nodes = std::stoi(v[1]); - printf("Number of nodes read in %zu\n", num_nodes); - mesh.initialize_nodes(num_nodes); - - std::vector required_node_state = { node_state::coords }; - node.initialize(num_nodes, num_dims, required_node_state); - - found=true; - } // end if - - - if (i>1000){ - std::cerr << "ERROR: Failed to find POINTS in file" << std::endl; - break; - } // end if - - i++; - } // end while - - // read the node coordinates - for (node_gid=0; node_gid v = split (str, delimiter); - - // save the nodal coordinates - node.coords.host(node_gid, 0) = mesh_inps.scale_x*std::stod(v[0]); // double - node.coords.host(node_gid, 1) = mesh_inps.scale_y*std::stod(v[1]); // double - if(num_dims==3){ - node.coords.host(node_gid, 2) = mesh_inps.scale_z*std::stod(v[2]); // double - } - - } // end for nodes - - - // Update device nodal positions - node.coords.update_device(); - - 
- found=false; - - // look for CELLS - i = 0; - size_t num_elem = 0; - while (found==false) { - std::string str; - std::getline(in, str); - - std::string delimiter = " "; - std::vector v = split (str, delimiter); - std::cout << v[0] << std::endl; // printing - - // looking for the following text: - // CELLS num_elem size - if(v[0] == "CELLS"){ - num_elem = std::stoi(v[1]); - printf("Number of elements read in %zu\n", num_elem); - - // initialize elem variables - mesh.initialize_elems(num_elem, num_dims); - - found=true; - } // end if - - - if (i>1000){ - printf("ERROR: Failed to find CELLS \n"); - break; - } // end if - - i++; - } // end while - - - // read the node ids in the element - for (elem_gid=0; elem_gid v = split (str, delimiter); - num_nodes_in_elem = std::stoi(v[0]); - - for (size_t node_lid=0; node_lid v = split (str, delimiter); - - // looking for the following text: - // CELLS num_elem size - if(v[0] == "CELL_TYPES"){ - - std::getline(in, str); - elem_type = std::stoi(str); - - found=true; - } // end if - - - if (i>1000){ - printf("ERROR: Failed to find elem_TYPE \n"); - break; - } // end if - - i++; - } // end while - printf("Element type = %zu \n", elem_type); - // elem types: - // linear hex = 12, linear quad = 9 - found=false; - - - if(num_nodes_in_elem==8 & elem_type != 12) { - printf("Wrong element type of %zu \n", elem_type); - std::cerr << "ERROR: incorrect element type in VTK file" << std::endl; - } - - in.close(); - - } // end of VTKread function - - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn read_vtu_mesh - /// - /// \brief Read ASCII .vtu mesh file - /// - /// \param Simulation mesh - /// \param Simulation state - /// \param Node state struct - /// \param Number of dimensions - /// - ///////////////////////////////////////////////////////////////////////////// - void read_vtu_mesh(Mesh_t& mesh, - GaussPoint_t& GaussPoints, - node_t& node, - corner_t& corner, - mesh_input_t& mesh_inps, - 
int num_dims) - { - - std::cout<<"Reading VTU file in a multiblock VTK mesh"< required_node_state = { node_state::coords }; - node.initialize(num_nodes, num_dims, required_node_state); - - //------------------------------------ - // allocate the elem object id array - mesh_inps.object_ids = DCArrayKokkos (num_elems, "ObjectIDs"); - - - // ------------------------ - // Mesh file storage order: - // objectId - // Points - // connectivity - // offsets - // types - // ------------------------ - - // temporary arrays - DCArrayKokkos node_coords(num_nodes,3, "node_coords_vtu_file"); // always 3 with vtu files - DCArrayKokkos connectivity(num_elems,num_nodes_in_elem, "connectivity_vtu_file"); - DCArrayKokkos elem_types(num_elems, "elem_types_vtu_file"); // element types - - - // for all fields, we stop recording when we get to "<" - std::string stop = "<"; - - // the size of 1D storage from reading the mesh file - size_t size; - - // --- - // Object ids - // --- - - // the object id in the element - // array dims are (num_elems) - found = extract_values_xml(mesh_inps.object_ids.host.pointer(), - "\"ObjectId\"", - stop, - in, - size); - if(found==false){ - throw std::runtime_error("ERROR: ObjectIDs were not found in the XML file!"); - //std::cout << "ERROR: ObjectIDs were not found in the XML file!" << std::endl; - } - mesh_inps.object_ids.update_device(); - - - // --- - // Nodal coordinates of mesh - // --- - - // coordinates of the node - // array dims are (num_nodes,dims) - // must use the quotes around Points to read the point values - found = extract_values_xml(node_coords.host.pointer(), - "\"Points\"", - stop, - in, - size); - if(found==false){ - throw std::runtime_error("**** ERROR: mesh nodes were not found in the XML file! ****"); - //std::cout << "ERROR: mesh nodes were not found in the XML file!" 
<< std::endl; - } - if (size!=num_nodes*3){ - throw std::runtime_error("ERROR: failed to read all the mesh nodes!"); - //std::cout << "ERROR: failed to read all the mesh nodes!" << std::endl; - } - node_coords.update_device(); - - // dimensional scaling of the mesh - const double scl_x = mesh_inps.scale_x; - const double scl_y = mesh_inps.scale_y; - const double scl_z = mesh_inps.scale_z; - - // save the node coordinates to the state array - FOR_ALL(node_gid, 0, mesh.num_nodes, { - - // save the nodal coordinates - node.coords(node_gid, 0) = scl_x*node_coords(node_gid, 0); // double - node.coords(node_gid, 1) = scl_y*node_coords(node_gid, 1); // double - if(num_dims==3){ - node.coords(node_gid, 2) = scl_z*node_coords(node_gid, 2); // double - } - - }); // end for parallel nodes - node.coords.update_host(); - - - // --- - // Nodes in the element - // --- - - // fill temporary nodes in the element array - // array dims are (num_elems,num_nodes_in_elem) - found = extract_values_xml(connectivity.host.pointer(), - "\"connectivity\"", - stop, - in, - size); - if(found==false){ - std::cout << "ERROR: mesh connectivity was not found in the XML file!" << std::endl; - } - connectivity.update_device(); - - // array dims are the (num_elems) - // 8 = pixal i,j,k linear quad format - // 9 = linear quad ensight ordering - // 12 = linear ensight hex ordering - // 72 = VTK_LAGRANGE_HEXAHEDRON - // .... - found = extract_values_xml(elem_types.host.pointer(), - "\"types\"", - stop, - in, - size); - if(found==false){ - std::cout << "ERROR: element types were not found in the XML file!" 
<< std::endl; - } - elem_types.update_device(); - - // check that the element type is supported by Fierro - FOR_ALL (elem_gid, 0, mesh.num_elems, { - if(elem_types(elem_gid) == element_types::linear_quad || - elem_types(elem_gid) == element_types::linear_hex_ijk || - elem_types(elem_gid) == element_types::linear_hex || - elem_types(elem_gid) == element_types::arbitrary_hex ) - { - // at least one of them is true - } - else - { - // unknown element used - Kokkos::abort("Unknown element type in the mesh \n"); - } - }); - - // Convert from ensight linear hex to a IJK mesh - CArrayKokkos convert_ensight_to_ijk(8, "convert_ensight_to_ijk"); - - // Convert the arbitrary order hex to a IJK mesh - DCArrayKokkos convert_pn_vtk_to_ijk(mesh.num_nodes_in_elem, "convert_pn_vtk_to_ijk"); - - //build the connectivity for element type 12 - // elem_types.host(0) - switch(elem_types.host(0)){ - - case element_types::linear_quad: - // the node order is correct, no changes required - - FOR_ALL (elem_gid, 0, mesh.num_elems, { - - for (size_t node_lid=0; node_lid origin(num_dim); - // SimulationParamaters.mesh_input.origin.update_host(); - for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } - - // --- 2D parameters --- - // const int num_faces_in_elem = 4; // number of faces in elem - // const int num_points_in_elem = 4; // number of points in elem - // const int num_points_in_face = 2; // number of points in a face - // const int num_edges_in_elem = 4; // number of edges in a elem - - // --- mesh node ordering --- - // Convert ijk index system to the finite element numbering convention - // for vertices in elem - auto convert_point_number_in_quad = CArray(4); - convert_point_number_in_quad(0) = 0; - convert_point_number_in_quad(1) = 1; - convert_point_number_in_quad(2) = 3; - convert_point_number_in_quad(3) = 2; - - // intialize node variables - mesh.initialize_nodes(num_nodes); - - // initialize node state, for now, we just need coordinates, 
the rest will be initialize by the respective solvers - std::vector required_node_state = { node_state::coords }; - node.initialize(num_nodes, num_dim, required_node_state); - - // --- Build nodes --- - - // populate the point data structures + // populate the point data structures + for (int k = 0; k < num_points_k; k++) { for (int j = 0; j < num_points_j; j++) { for (int i = 0; i < num_points_i; i++) { // global id for the point - int node_gid = get_id(i, j, 0, num_points_i, num_points_j); + int node_gid = get_id(i, j, k, num_points_i, num_points_j); // store the point coordinates node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; + node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; } // end for i } // end for j + } // end for k + + node.coords.update_device(); - node.coords.update_device(); + // initialize elem variables + mesh.initialize_elems(num_elems, num_dim); - // initialize elem variables - mesh.initialize_elems(num_elems, num_dim); + // --- Build elems --- - // populate the elem center data structures + // populate the elem center data structures + for (int k = 0; k < num_elems_k; k++) { for (int j = 0; j < num_elems_j; j++) { for (int i = 0; i < num_elems_i; i++) { // global id for the elem - int elem_gid = get_id(i, j, 0, num_elems_i, num_elems_j); + int elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); // store the point IDs for this elem where the range is - // (i:i+1, j:j+1 for a linear quad + // (i:i+1, j:j+1, k:k+1) for a linear hexahedron int this_point = 0; + for (int kcount = k; kcount <= k + 1; kcount++) { + for (int jcount = j; jcount <= j + 1; jcount++) { + for (int icount = i; icount <= i + 1; icount++) { + // global id for the points + int node_gid = get_id(icount, jcount, kcount, + num_points_i, num_points_j); - for (int jcount = j; jcount <= j + 1; jcount++) { - for (int icount = i; icount <= i + 1; icount++) { - // global id for the points - int node_gid = 
get_id(icount, jcount, 0, num_points_i, num_points_j); + // convert this_point index to the FE index convention + int this_index = this_point; //convert_point_number_in_Hex(this_point); - // convert this_point index to the FE index convention - int this_index = convert_point_number_in_quad(this_point); - - // store the points in this elem according the the finite - // element numbering convention - mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; + // store the points in this elem according the the finite + // element numbering convention + mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; - // increment the point counting index - this_point = this_point + 1; - } // end for icount - } // end for jcount + // increment the point counting index + this_point = this_point + 1; + } // end for icount + } // end for jcount + } // end for kcount } // end for i } // end for j + } // end for k - // update device side - mesh.nodes_in_elem.update_device(); - - // intialize corner variables - int num_corners = num_elems * mesh.num_nodes_in_elem; - mesh.initialize_corners(num_corners); - // corner.initialize(num_corners, num_dim); - - // Build connectivity - mesh.build_connectivity(); - } // end build_2d_box - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn build_2d_polar - /// - /// \brief Builds an unstructured 2D polar mesh - /// - /// \param Simulation mesh that is built - /// \param Element state data - /// \param Node state data - /// \param Corner state data - /// \param Simulation parameters - /// - ///////////////////////////////////////////////////////////////////////////// - void build_2d_polar(Mesh_t& mesh, - GaussPoint_t& GaussPoints, - node_t& node, - corner_t& corner, - SimulationParameters_t& SimulationParamaters) const - { - printf("Creating a 2D polar mesh \n"); - - int num_dim = 2; - - const double inner_radius = SimulationParamaters.mesh_input.inner_radius; - const double outer_radius = 
SimulationParamaters.mesh_input.outer_radius; - - const double start_angle = PI / 180.0 * SimulationParamaters.mesh_input.starting_angle; - const double end_angle = PI / 180.0 * SimulationParamaters.mesh_input.ending_angle; - - const int num_elems_i = SimulationParamaters.mesh_input.num_radial_elems; - const int num_elems_j = SimulationParamaters.mesh_input.num_angular_elems; - - const int num_points_i = num_elems_i + 1; // num points in x - const int num_points_j = num_elems_j + 1; // num points in y - - const int num_nodes = num_points_i * num_points_j; - - const double dx = (outer_radius - inner_radius) / ((double)num_elems_i); // len/(elems) - const double dy = (end_angle - start_angle) / ((double)num_elems_j); // len/(elems) - - const int num_elems = num_elems_i * num_elems_j; - - std::vector origin(num_dim); - - for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } - - // --- 2D parameters --- - // const int num_faces_in_elem = 4; // number of faces in elem - // const int num_points_in_elem = 4; // number of points in elem - // const int num_points_in_face = 2; // number of points in a face - // const int num_edges_in_elem = 4; // number of edges in a elem - - // --- mesh node ordering --- - // Convert ijk index system to the finite element numbering convention - // for vertices in elem - auto convert_point_number_in_quad = CArray(4); - convert_point_number_in_quad(0) = 0; - convert_point_number_in_quad(1) = 1; - convert_point_number_in_quad(2) = 3; - convert_point_number_in_quad(3) = 2; - - // intialize node variables - mesh.initialize_nodes(num_nodes); - - // initialize node state, for now, we just need coordinates, the rest will be initialize by the respective solvers - std::vector required_node_state = { node_state::coords }; - node.initialize(num_nodes, num_dim, required_node_state); - - // populate the point data structures - for (int j = 0; j < num_points_j; j++) { - for (int i = 0; i < num_points_i; i++) { - // 
global id for the point - int node_gid = get_id(i, j, 0, num_points_i, num_points_j); - - double r_i = inner_radius + (double)i * dx; - double theta_j = start_angle + (double)j * dy; - - // store the point coordinates - node.coords.host(node_gid, 0) = origin[0] + r_i * cos(theta_j); - node.coords.host(node_gid, 1) = origin[1] + r_i * sin(theta_j); - - if(node.coords.host(node_gid, 0) < 0.0){ - throw std::runtime_error("**** NODE RADIUS FOR RZ MESH MUST BE POSITIVE ****"); - } - - } // end for i - } // end for j - - - node.coords.update_device(); - - // initialize elem variables - mesh.initialize_elems(num_elems, num_dim); - - // populate the elem center data structures - for (int j = 0; j < num_elems_j; j++) { - for (int i = 0; i < num_elems_i; i++) { - // global id for the elem - int elem_gid = get_id(i, j, 0, num_elems_i, num_elems_j); - - // store the point IDs for this elem where the range is - // (i:i+1, j:j+1 for a linear quad - int this_point = 0; - - for (int jcount = j; jcount <= j + 1; jcount++) { - for (int icount = i; icount <= i + 1; icount++) { - // global id for the points - int node_gid = get_id(icount, jcount, 0, num_points_i, num_points_j); - - // convert this_point index to the FE index convention - int this_index = convert_point_number_in_quad(this_point); - - // store the points in this elem according the the finite - // element numbering convention - mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; - - // increment the point counting index - this_point = this_point + 1; - } // end for icount - } // end for jcount - } // end for i - } // end for j - - // update device side - mesh.nodes_in_elem.update_device(); - - // intialize corner variables - int num_corners = num_elems * mesh.num_nodes_in_elem; - mesh.initialize_corners(num_corners); - // corner.initialize(num_corners, num_dim); - - // Build connectivity - mesh.build_connectivity(); - } // end build_2d_box - - 
///////////////////////////////////////////////////////////////////////////// - /// - /// \fn build_3d_box - /// - /// \brief Builds an unstructured 3D rectilinear mesh - /// - /// \param Simulation mesh that is built - /// \param Element state data - /// \param Node state data - /// \param Corner state data - /// \param Simulation parameters - /// - ///////////////////////////////////////////////////////////////////////////// - void build_3d_box(Mesh_t& mesh, - GaussPoint_t& GaussPoints, - node_t& node, - corner_t& corner, - SimulationParameters_t& SimulationParamaters) const - { - printf("Creating a 3D box mesh \n"); - - const int num_dim = 3; - - // SimulationParamaters.mesh_input.length.update_host(); - const double lx = SimulationParamaters.mesh_input.length[0]; - const double ly = SimulationParamaters.mesh_input.length[1]; - const double lz = SimulationParamaters.mesh_input.length[2]; - - // SimulationParamaters.mesh_input.num_elems.update_host(); - const int num_elems_i = SimulationParamaters.mesh_input.num_elems[0]; - const int num_elems_j = SimulationParamaters.mesh_input.num_elems[1]; - const int num_elems_k = SimulationParamaters.mesh_input.num_elems[2]; - - const int num_points_i = num_elems_i + 1; // num points in x - const int num_points_j = num_elems_j + 1; // num points in y - const int num_points_k = num_elems_k + 1; // num points in y - - const int num_nodes = num_points_i * num_points_j * num_points_k; - - const double dx = lx / ((double)num_elems_i); // len/(num_elems_i) - const double dy = ly / ((double)num_elems_j); // len/(num_elems_j) - const double dz = lz / ((double)num_elems_k); // len/(num_elems_k) - - const int num_elems = num_elems_i * num_elems_j * num_elems_k; - - std::vector origin(num_dim); - // SimulationParamaters.mesh_input.origin.update_host(); - for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } - - // --- 3D parameters --- - // const int num_faces_in_elem = 6; // number of faces in 
elem - // const int num_points_in_elem = 8; // number of points in elem - // const int num_points_in_face = 4; // number of points in a face - // const int num_edges_in_elem = 12; // number of edges in a elem - - - // initialize mesh node variables - mesh.initialize_nodes(num_nodes); - - // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers - std::vector required_node_state = { node_state::coords }; - node.initialize(num_nodes, num_dim, required_node_state); - - // --- Build nodes --- - - // populate the point data structures - for (int k = 0; k < num_points_k; k++) { - for (int j = 0; j < num_points_j; j++) { - for (int i = 0; i < num_points_i; i++) { - // global id for the point - int node_gid = get_id(i, j, k, num_points_i, num_points_j); - - // store the point coordinates - node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; - node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; - node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; - } // end for i - } // end for j - } // end for k - - - node.coords.update_device(); - - // initialize elem variables - mesh.initialize_elems(num_elems, num_dim); - - // --- Build elems --- - - // populate the elem center data structures - for (int k = 0; k < num_elems_k; k++) { - for (int j = 0; j < num_elems_j; j++) { - for (int i = 0; i < num_elems_i; i++) { - // global id for the elem - int elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); - - // store the point IDs for this elem where the range is - // (i:i+1, j:j+1, k:k+1) for a linear hexahedron - int this_point = 0; - for (int kcount = k; kcount <= k + 1; kcount++) { - for (int jcount = j; jcount <= j + 1; jcount++) { - for (int icount = i; icount <= i + 1; icount++) { - // global id for the points - int node_gid = get_id(icount, jcount, kcount, - num_points_i, num_points_j); - - // convert this_point index to the FE index convention - int this_index = this_point; 
//convert_point_number_in_Hex(this_point); - - // store the points in this elem according the the finite - // element numbering convention - mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; - - // increment the point counting index - this_point = this_point + 1; - } // end for icount - } // end for jcount - } // end for kcount - } // end for i - } // end for j - } // end for k - - // update device side - mesh.nodes_in_elem.update_device(); - - // initialize corner variables - int num_corners = num_elems * mesh.num_nodes_in_elem; - mesh.initialize_corners(num_corners); - // corner.initialize(num_corners, num_dim); - - // Build connectivity - mesh.build_connectivity(); - } // end build_3d_box - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn build_3d_HexN_box - /// - /// \brief Builds an unstructured high order 3D rectilinear mesh - /// - /// \param Simulation mesh that is built - /// \param Element state data - /// \param Node state data - /// \param Corner state data - /// \param Simulation parameters - /// - ///////////////////////////////////////////////////////////////////////////// - void build_3d_HexN_box(Mesh_t& mesh, - GaussPoint_t& GaussPoints, - node_t& node, - corner_t& corner, - SimulationParameters_t& SimulationParamaters) const - { - printf(" ***** WARNING:: build_3d_HexN_box not yet implemented\n"); - const int num_dim = 3; - - // SimulationParamaters.mesh_input.length.update_host(); - const double lx = SimulationParamaters.mesh_input.length[0]; - const double ly = SimulationParamaters.mesh_input.length[1]; - const double lz = SimulationParamaters.mesh_input.length[2]; - - // SimulationParamaters.mesh_input.num_elems.update_host(); - const int num_elems_i = SimulationParamaters.mesh_input.num_elems[0]; - const int num_elems_j = SimulationParamaters.mesh_input.num_elems[1]; - const int num_elems_k = SimulationParamaters.mesh_input.num_elems[2]; - - // creating zones for the Pn order - const int 
Pn_order = SimulationParamaters.mesh_input.p_order; - - if (Pn_order > 19) { - printf("Fierro DG and RD solvers are only valid for elements up to Pn = 19 \n"); - return; - } - - const int num_zones_i = Pn_order*num_elems_i; - const int num_zones_j = Pn_order*num_elems_j; - const int num_zones_k = Pn_order*num_elems_k; - - const int num_points_i = num_zones_i+1; // num points in x accounting for Pn - const int num_points_j = num_zones_j+1; // num points in y accounting for Pn - const int num_points_k = num_zones_k+1; // num points in y accounting for Pn - - - const double dx = lx/((double)num_zones_i); // len/(num_zones_i) - const double dy = ly/((double)num_zones_j); // len/(num_zones_j) - const double dz = lz/((double)num_zones_k); // len/(num_zones_k) - - const int num_elems = num_elems_i*num_elems_j*num_elems_k; - // const int num_zones = num_zones_i*num_zones_j*num_zones_k; // accounts for Pn - - std::vector origin(num_dim); - for (int i = 0; i < num_dim; i++) { origin[i] = SimulationParamaters.mesh_input.origin[i]; } - - // --- 3D parameters --- - // const int num_faces_in_zone = 6; // number of faces in zone - // const int num_points_in_zone = 8; // number of points in zone - // const int num_points_in_face = 4; // number of points in a face - - // p_order = 1, 2, 3, 4, 5 - // num_nodes = 2, 3, 4, 5, 6 - const int num_1D_points = Pn_order+1; - const int num_points_in_elem = num_1D_points*num_1D_points*num_1D_points; - - - // --- elem --- - auto elem_coords = CArray (num_elems, num_dim); - auto elem_point_list = CArray (num_elems, num_points_in_elem); - - - // --- point --- - int num_points = num_points_i * num_points_j * num_points_k; - auto pt_coords = CArray (num_points, num_dim); - - - // --- Build nodes --- - - // initialize node variables - mesh.initialize_nodes(num_points); - - // - std::vector required_node_state = { node_state::coords }; - node.initialize(num_points, num_dim, required_node_state); - // populate the point data structures - for (int k = 
0; k < num_points_k; k++){ - for (int j = 0; j < num_points_j; j++){ - for (int i = 0; i < num_points_i; i++){ - - - // global id for the point - int node_gid = get_id(i, j, k, num_points_i, num_points_j); - - // store the point coordinates - node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; - node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; - node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; - - } // end for k - } // end for i - } // end for j - - - node.coords.update_device(); - - - // initialize elem variables - mesh.initialize_elems(num_elems, num_dim); - - // --- Build elems --- - - // populate the elem center data structures accounting for Pn - for (int k=0; k graphics_times, - std::vector node_states, - std::vector gauss_pt_states, - std::vector material_pt_states, - const size_t solver_id) - { - - - // node_state is an enum for possible fields (e.g., coords, velocity, etc.), see state.h - // gauss_pt_state is an enum for possible fields (e.g., vol, divergence, etc.) - // material_pt_state is an enum for possible fields (e.g., den, pres, etc.) 
- - - // ******************* - // Update host - // ******************* - - const size_t num_mats = State.MaterialPoints.num_material_points.size(); - - // material point values - - // Update host data for mat_pt state - for (auto field : material_pt_states){ - switch(field){ - // scalar vars to write out - case material_pt_state::density: - State.MaterialPoints.den.update_host(); - break; - case material_pt_state::pressure: - State.MaterialPoints.pres.update_host(); - break; - case material_pt_state::specific_internal_energy: - State.MaterialPoints.sie.update_host(); - break; - case material_pt_state::sound_speed: - State.MaterialPoints.sspd.update_host(); - break; - case material_pt_state::mass: - State.MaterialPoints.mass.update_host(); - break; - case material_pt_state::volume_fraction: - State.MaterialPoints.volfrac.update_host(); - State.MaterialPoints.geo_volfrac.update_host(); - break; - case material_pt_state::eroded_flag: - State.MaterialPoints.eroded.update_host(); - break; - // tensor vars to write out - case material_pt_state::stress: - State.MaterialPoints.stress.update_host(); - break; - - // additional vars for thermal-mechanical solver - case material_pt_state::thermal_conductivity: - State.MaterialPoints.conductivity.update_host(); - break; - - case material_pt_state::specific_heat: - State.MaterialPoints.specific_heat.update_host(); - break; - - // add other variables here - - // not used - case material_pt_state::elastic_modulii: - break; - case material_pt_state::shear_modulii: - break; - case material_pt_state::poisson_ratios: - break; - case material_pt_state::heat_flux: - break; - default: - std::cout<<"Desired material point state not understood in outputs"< elem_scalar_var_names(num_elem_scalar_vars); - std::vector elem_tensor_var_names(num_elem_tensor_vars); - - // Scalar, vector, and tensor values associated with a material in part elems - std::vector mat_elem_scalar_var_names(num_mat_pt_scalar_vars); - std::vector 
mat_elem_tensor_var_names(num_mat_pt_tensor_vars); - - - // the ids to access a variable in the mat_scalar_var_name or tensor list - int mat_den_id = -1; - int mat_pres_id = -1; - int mat_sie_id = -1; - int mat_sspd_id = -1; - int mat_mass_id = -1; - int mat_volfrac_id = -1; - int mat_geo_volfrac_id = -1; // geometric volume fraction of part - int mat_eroded_id = -1; - int mat_stress_id = -1; - - int mat_conductivity_id = -1; - int mat_specific_heat_id = -1; - - // the index for the scalar, vector, and tensor fields - size_t var = 0; - size_t vector_var = 0; - size_t tensor_var = 0; - - // material point state to output - for (auto field : SimulationParamaters.output_options.output_mat_pt_state){ - switch(field){ - // scalar vars - case material_pt_state::density: - mat_elem_scalar_var_names[var] = "mat_den"; - mat_den_id = var; - var++; - break; - case material_pt_state::pressure: - mat_elem_scalar_var_names[var] = "mat_pres"; - mat_pres_id = var; - var++; - break; - case material_pt_state::specific_internal_energy: - mat_elem_scalar_var_names[var] = "mat_sie"; - mat_sie_id = var; - var++; - break; - case material_pt_state::sound_speed: - mat_elem_scalar_var_names[var] = "mat_sspd"; - mat_sspd_id = var; - var++; - break; - case material_pt_state::mass: - mat_elem_scalar_var_names[var] = "mat_mass"; - mat_mass_id = var; - var++; - break; - case material_pt_state::volume_fraction: - mat_elem_scalar_var_names[var] = "mat_volfrac"; - mat_volfrac_id = var; - var++; - - mat_elem_scalar_var_names[var] = "mat_geo_volfrac"; - mat_geo_volfrac_id = var; - var++; - break; - case material_pt_state::eroded_flag: - mat_elem_scalar_var_names[var] = "mat_eroded"; - mat_eroded_id = var; - var++; - break; - // tensor vars - case material_pt_state::stress: - mat_elem_tensor_var_names[tensor_var] = "mat_stress"; - mat_stress_id = tensor_var; - tensor_var++; - break; - - - // additional vars for thermal-mechanical solver - case material_pt_state::thermal_conductivity: - 
mat_elem_scalar_var_names[var] = "mat_thermal_K"; - mat_conductivity_id = var; - var++; - break; - - case material_pt_state::specific_heat: - mat_elem_scalar_var_names[var] = "mat_Cp"; - mat_specific_heat_id = var; - var++; - break; - - - // add other variables here - - // not used - case material_pt_state::elastic_modulii: - break; - case material_pt_state::shear_modulii: - break; - case material_pt_state::poisson_ratios: - break; - case material_pt_state::heat_flux: - break; - } // end switch - } // end for over mat_pt_states - - - // element average fields to output - - // the ids to access a variable in the elem_scalar_var_name or tensor list - int den_id = -1; - int pres_id = -1; - int sie_id = -1; - int sspd_id = -1; - int mass_id = -1; - int stress_id = -1; - - int conductivity_id = -1; - int specific_heat_id = -1; - - // reset the counters - var = 0; - vector_var = 0; - tensor_var = 0; - - // element state to output - for (auto field : SimulationParamaters.output_options.output_elem_state){ - switch(field){ - // scalar vars - case material_pt_state::density: - elem_scalar_var_names[var] = "den"; - den_id = var; - var++; - break; - case material_pt_state::pressure: - elem_scalar_var_names[var] = "pres"; - pres_id = var; - var++; - break; - case material_pt_state::specific_internal_energy: - elem_scalar_var_names[var] = "sie"; - sie_id = var; - var++; - break; - case material_pt_state::sound_speed: - elem_scalar_var_names[var] = "sspd"; - sspd_id = var; - var++; - break; - case material_pt_state::mass: - elem_scalar_var_names[var] = "mass"; - mass_id = var; - var++; - break; - // tensor vars - case material_pt_state::stress: - elem_tensor_var_names[tensor_var] = "stress"; - stress_id = tensor_var; - tensor_var++; - break; - - // heat transfer variables - case material_pt_state::thermal_conductivity: - elem_scalar_var_names[var] = "thermal_K"; - conductivity_id = var; - var++; - break; - - case material_pt_state::specific_heat: - elem_scalar_var_names[var] = 
"Cp"; - specific_heat_id = var; - var++; - break; - - // add other variables here - - // not used - case material_pt_state::volume_fraction: - break; - case material_pt_state::eroded_flag: - break; - case material_pt_state::elastic_modulii: - break; - case material_pt_state::shear_modulii: - break; - case material_pt_state::poisson_ratios: - break; - case material_pt_state::heat_flux: - break; - } // end switch - } // end for over mat_pt_states - - // append Gauss point vars to the element arrays - int vol_id = -1; - int div_id = -1; - int level_set_id = -1; - int vel_grad_id = -1; - - - for (auto field : SimulationParamaters.output_options.output_gauss_pt_state){ - switch(field){ - // scalars - case gauss_pt_state::volume: - elem_scalar_var_names[var] = "vol"; - vol_id = var; - var++; - break; - case gauss_pt_state::divergence_velocity: - elem_scalar_var_names[var] = "div"; - div_id = var; - var++; - break; - - case gauss_pt_state::level_set: - elem_scalar_var_names[var] = "level_set"; - level_set_id = var; - var++; - break; - - // tensors - case gauss_pt_state::gradient_velocity: - elem_tensor_var_names[tensor_var] = "vel_grad"; - vel_grad_id = tensor_var; - tensor_var++; - break; - } // end switch - } // end loop over gauss_pt_states - - - // ******************* - // nodal values - // ******************* - - size_t num_node_scalar_vars = 0; - size_t num_node_vector_vars = 0; - - for (auto field : SimulationParamaters.output_options.output_node_state){ - switch(field){ - // --- scalars - case node_state::mass: - num_node_scalar_vars ++; - break; - case node_state::temp: - num_node_scalar_vars ++; - break; - // -- vectors - case node_state::coords: - num_node_vector_vars ++; - break; - case node_state::velocity: - num_node_vector_vars ++; // for velocity - num_node_vector_vars ++; // for acceleration - break; - case node_state::gradient_level_set: - num_node_vector_vars ++; - break; - case node_state::force: - break; - - // heat transer vars - case 
node_state::heat_transfer: - break; - } // end switch - } // end for over - Kokkos::fence(); - - - // Scalar and vector values associated with a node - std::vector node_scalar_var_names(num_node_scalar_vars); - std::vector node_vector_var_names(num_node_vector_vars); - - int node_mass_id = -1; - int node_vel_id = -1; - int node_accel_id = -1; - int node_coord_id = -1; - int node_temp_id = -1; - int node_grad_level_set_id = -1; - - // reset counters for node fields - var = 0; - vector_var = 0; - tensor_var = 0; - - for (auto field : SimulationParamaters.output_options.output_node_state){ - switch(field){ - // scalars - case node_state::mass: - node_scalar_var_names[var] = "node_mass"; - node_mass_id = var; - var++; - break; - case node_state::temp: - node_scalar_var_names[var] = "node_temp"; - node_temp_id = var; - var++; - break; - - // vector fields - - case node_state::coords: - node_vector_var_names[vector_var] = "node_coords"; - node_coord_id = vector_var; - vector_var++; - break; - - case node_state::velocity: - node_vector_var_names[vector_var] = "node_vel"; - node_vel_id = vector_var; - vector_var++; - - node_vector_var_names[vector_var] = "node_accel"; - node_accel_id = vector_var; - vector_var++; - break; - - case node_state::gradient_level_set: - node_vector_var_names[vector_var] = "node_grad_lvlset"; - node_grad_level_set_id = vector_var; - vector_var++; - break; - - // -- not used vars - case node_state::force: - break; - - // heat transer vars - case node_state::heat_transfer: - break; - - // tensors - - } // end switch - } // end for over - - - // ************************************** - // build and save element average fields - // ************************************** - - // short hand - const size_t num_nodes = mesh.num_nodes; - const size_t num_elems = mesh.num_elems; - const size_t num_dims = mesh.num_dims; - const size_t num_nodes_in_elem = mesh.num_nodes_in_elem; - const int Pn_order = mesh.Pn; - - // save the elem state to an array for 
exporting to graphics files - DCArrayKokkos elem_scalar_fields(num_elem_scalar_vars, num_elems, "elem_scalars"); - DCArrayKokkos elem_tensor_fields(num_elem_tensor_vars, num_elems, 3, 3, "elem_tensors"); - elem_scalar_fields.set_values(0.0); - elem_tensor_fields.set_values(0.0); - - - // ----------------------------------------------------------------------- - // save the output fields to a single element average array for all state - // ----------------------------------------------------------------------- - for (int mat_id = 0; mat_id < num_mats; mat_id++) { - - // material point and guass point state are concatenated together - concatenate_elem_fields(State.MaterialPoints, - State.GaussPoints, - elem_scalar_fields, - elem_tensor_fields, - State.MaterialToMeshMaps.elem_in_mat_elem, - SimulationParamaters.output_options.output_elem_state, - SimulationParamaters.output_options.output_gauss_pt_state, - State.MaterialToMeshMaps.num_mat_elems.host(mat_id), - mat_id, - num_elems, - den_id, - pres_id, - sie_id, - sspd_id, - mass_id, - stress_id, - vol_id, - div_id, - level_set_id, - vel_grad_id, - conductivity_id, - specific_heat_id); - } // end for mats - - // make specific fields for the element average - if (sie_id>=0){ - FOR_ALL(elem_gid, 0, num_elems, { - // get sie by dividing by the mass - elem_scalar_fields(sie_id, elem_gid) /= (elem_scalar_fields(mass_id, elem_gid)+1.e-20); - }); - } // end if - - Kokkos::fence(); - elem_scalar_fields.update_host(); - elem_tensor_fields.update_host(); - - - // ************************ - // Build the nodal fields - // ************************ - - // save the nodal fields to an array for exporting to graphics files - DCArrayKokkos node_scalar_fields(num_node_scalar_vars, num_nodes, "node_scalars"); - DCArrayKokkos node_vector_fields(num_node_vector_vars, num_nodes, 3, "node_tenors"); - - concatenate_nodal_fields(State.node, - node_scalar_fields, - node_vector_fields, - SimulationParamaters.output_options.output_node_state, - dt, 
- num_nodes, - num_dims, - node_mass_id, - node_vel_id, - node_accel_id, - node_coord_id, - node_grad_level_set_id, - node_temp_id); - - - Kokkos::fence(); - node_scalar_fields.update_host(); - node_vector_fields.update_host(); - - - // ******************************** - // Write the nodal and elem fields - // ******************************** - - if (SimulationParamaters.output_options.format == output_options::viz || - SimulationParamaters.output_options.format == output_options::viz_and_state) { - - // create the folder structure if it does not exist - struct stat st; - - if (stat("vtk", &st) != 0) { - int returnCode = system("mkdir vtk"); - - if (returnCode == 1) { - std::cout << "Unable to make vtk directory" << std::endl; - } - } - else{ - if(solver_id==0 && graphics_id==0){ - // delete the existing files inside - int returnCode = system("rm vtk/Fierro*"); - if (returnCode == 1) { - std::cout << "Unable to clear vtk/Fierro directory" << std::endl; - } - } - } - - if (stat("vtk/data", &st) != 0) { - int returnCode = system("mkdir vtk/data"); - if (returnCode == 1) { - std::cout << "Unable to make vtk/data directory" << std::endl; - } - } - else{ - if(solver_id==0 && graphics_id==0){ - // delete the existing files inside the folder - int returnCode = system("rm vtk/data/Fierro*"); - if (returnCode == 1) { - std::cout << "Unable to clear vtk/data directory" << std::endl; - } - } - } - - // call the .vtu writer for element fields - std::string elem_fields_name = "fields"; - - // make a view of node coords for passing into functions - ViewCArray node_coords_host(&State.node.coords.host(0,0), num_nodes, num_dims); - ViewCArray nodes_in_elem_host(&mesh.nodes_in_elem.host(0,0), num_elems, num_nodes_in_elem); - - - write_vtu(node_coords_host, - nodes_in_elem_host, - elem_scalar_fields, - elem_tensor_fields, - node_scalar_fields, - node_vector_fields, - elem_scalar_var_names, - elem_tensor_var_names, - node_scalar_var_names, - node_vector_var_names, - elem_fields_name, 
- graphics_id, - num_nodes, - num_elems, - num_nodes_in_elem, - Pn_order, - num_dims, - solver_id); - - - // ******************************** - // Build and write the mat fields - // ******************************** - - - // note: the file path and folder was created in the elem and node outputs - size_t num_mat_files_written = 0; - if(num_mat_pt_scalar_vars > 0 || num_mat_pt_tensor_vars >0){ - - for (int mat_id = 0; mat_id < num_mats; mat_id++) { - - const size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); - - // only save material data if the mat lives on the mesh, ie. has state allocated - if (num_mat_elems>0){ - - // set the nodal vars to zero size, we don't write these fields again - node_scalar_var_names.clear(); - node_vector_var_names.clear(); - - // the arrays storing all the material field data - DCArrayKokkos mat_elem_scalar_fields(num_mat_pt_scalar_vars, num_mat_elems, "mat_pt_scalars"); - DCArrayKokkos mat_elem_tensor_fields(num_mat_pt_tensor_vars, num_mat_elems, 3, 3, "mat_pt_tensors"); - - - // concatenate material fields into a single array - concatenate_mat_fields(State.MaterialPoints, - mat_elem_scalar_fields, - mat_elem_tensor_fields, - State.MaterialToMeshMaps.elem_in_mat_elem, - SimulationParamaters.output_options.output_mat_pt_state, - num_mat_elems, - mat_id, - mat_den_id, - mat_pres_id, - mat_sie_id, - mat_sspd_id, - mat_mass_id, - mat_volfrac_id, - mat_geo_volfrac_id, - mat_eroded_id, - mat_stress_id, - mat_conductivity_id, - mat_specific_heat_id); - Kokkos::fence(); - mat_elem_scalar_fields.update_host(); - mat_elem_tensor_fields.update_host(); - - - std::string str_mat_val = std::to_string(mat_id); - std::string mat_fields_name = "mat"; - mat_fields_name += str_mat_val; // add the mat number - - // save the nodes belonging to this part (i.e., the material) - DCArrayKokkos mat_node_coords(num_nodes,num_dims, "mat_node_coords"); - DCArrayKokkos mat_nodes_in_mat_elem(num_mat_elems, num_nodes_in_elem, 
"mat_nodes_in_mat_elem"); - - // the number of actual nodes belonging to the part (i.e., the material) - size_t num_mat_nodes = 0; - - // build a unique mesh (element and nodes) for the material (i.e., the part) - build_material_elem_node_lists(mesh, - State.node.coords, - mat_node_coords, - mat_nodes_in_mat_elem, - State.MaterialToMeshMaps.elem_in_mat_elem, - mat_id, - num_mat_nodes, - num_mat_elems, - num_nodes_in_elem, - num_dims); - - ViewCArray mat_node_coords_host(&mat_node_coords.host(0,0), num_mat_nodes, num_dims); - ViewCArray mat_nodes_in_elem_host(&mat_nodes_in_mat_elem.host(0,0), num_mat_elems, num_nodes_in_elem); - - // write out a vtu file this - write_vtu(mat_node_coords_host, - mat_nodes_in_elem_host, - mat_elem_scalar_fields, - mat_elem_tensor_fields, - node_scalar_fields, - node_vector_fields, - mat_elem_scalar_var_names, - mat_elem_tensor_var_names, - node_scalar_var_names, - node_vector_var_names, - mat_fields_name, - graphics_id, - num_mat_nodes, - num_mat_elems, - num_nodes_in_elem, - Pn_order, - num_dims, - solver_id); - - - num_mat_files_written++; - - } // end for mat_id - - } // end if material is on the mesh - - } // end if mat variables are to be written - - - // ************************************************* - // write Paraview files to open the graphics files - // ************************************************* - - // save the graphics time - graphics_times(graphics_id) = time_value; - - // check to see if an mesh state was written - bool write_mesh_state = false; - if( num_elem_scalar_vars > 0 || - num_elem_tensor_vars > 0 || - num_node_scalar_vars > 0 || - num_node_vector_vars > 0) - { - write_mesh_state = true; - } - - // check to see if a mat state was written - bool write_mat_pt_state = false; - if( num_mat_pt_scalar_vars > 0 || - num_mat_pt_tensor_vars > 0) - { - write_mat_pt_state = true; - } - - // call the vtm file writer - std::string mat_fields_name = "mat"; - write_vtm(graphics_times, - elem_fields_name, - 
mat_fields_name, - time_value, - graphics_id, - num_mat_files_written, - write_mesh_state, - write_mat_pt_state, - solver_id); - - // call the pvd file writer - write_pvd(graphics_times, - time_value, - graphics_id, - solver_id); - - - // increment graphics id counter - graphics_id++; // this is private variable in the class - - } // end if viz paraview output is to be written - - - // STATE - if (SimulationParamaters.output_options.format == output_options::state || - SimulationParamaters.output_options.format == output_options::viz_and_state) { - - write_material_point_state(mesh, - State, - SimulationParamaters, - time_value, - graphics_times, - node_states, - gauss_pt_states, - material_pt_states); - - } // end if state is to be written - - - // will drop ensight outputs in the near future - if (SimulationParamaters.output_options.format == output_options::ensight){ - write_ensight(mesh, - State, - SimulationParamaters, - dt, - time_value, - graphics_times, - node_states, - gauss_pt_states, - material_pt_states); - } - - return; - - } // end write_mesh - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn write_ensight - /// - /// \brief Writes an ensight output file - /// - /// \param Simulation mesh - /// \param State data - /// \param Simulation parameters - /// \param current time value - /// \param Vector of all graphics output times - /// - ///////////////////////////////////////////////////////////////////////////// - void write_ensight(Mesh_t& mesh, - State_t& State, - SimulationParameters_t& SimulationParamaters, - double dt, - double time_value, - CArray graphics_times, - std::vector node_states, - std::vector gauss_pt_states, - std::vector material_pt_states) - { - size_t num_mats = State.MaterialPoints.num_material_points.size(); - - // ---- Update host data ---- - - // material point values - State.MaterialPoints.den.update_host(); - State.MaterialPoints.pres.update_host(); - 
State.MaterialPoints.stress.update_host(); - State.MaterialPoints.sspd.update_host(); - State.MaterialPoints.sie.update_host(); - State.MaterialPoints.mass.update_host(); - State.MaterialPoints.eroded.update_host(); - - - // gauss point values - State.GaussPoints.vol.update_host(); - - // nodal values - State.node.coords.update_host(); - State.node.vel.update_host(); - State.node.mass.update_host(); - - Kokkos::fence(); - - // -------------------------- - - const int num_scalar_vars = 10; - const int num_vec_vars = 3; - - std::string name_tmp; - name_tmp = "Outputs_SGH"; - - char* name = new char [name_tmp.length() + 1]; - std::strcpy(name, name_tmp.c_str()); - - const char scalar_var_names[num_scalar_vars][15] = { - "den", "pres", "sie", "vol", "mass", "sspd", "speed", "mat_id", "elem_switch", "eroded" - }; - - const char vec_var_names[num_vec_vars][15] = { - "pos", "vel", "accel" - }; - - // short hand - const size_t num_nodes = mesh.num_nodes; - const size_t num_elems = mesh.num_elems; - const size_t num_dims = mesh.num_dims; - - // save the cell state to an array for exporting to graphics files - auto elem_fields = CArray(num_elems, num_scalar_vars); - int elem_switch = 1; - - - DCArrayKokkos speed(num_elems, "speed"); - FOR_ALL(elem_gid, 0, num_elems, { - double elem_vel[3]; // note:initialization with a list won't work - elem_vel[0] = 0.0; - elem_vel[1] = 0.0; - elem_vel[2] = 0.0; - // get the coordinates of the element center - for (int node_lid = 0; node_lid < mesh.num_nodes_in_elem; node_lid++) { - elem_vel[0] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 0); - elem_vel[1] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 1); - if (mesh.num_dims == 3) { - elem_vel[2] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 2); - } - else{ - elem_vel[2] = 0.0; - } - } // end loop over nodes in element - elem_vel[0] = elem_vel[0] / mesh.num_nodes_in_elem; - elem_vel[1] = elem_vel[1] / mesh.num_nodes_in_elem; - elem_vel[2] = elem_vel[2] / 
mesh.num_nodes_in_elem; - - double speed_sqrd = 0.0; - for (int dim = 0; dim < num_dims; dim++) { - speed_sqrd += elem_vel[dim] * elem_vel[dim]; - } - speed(elem_gid) = sqrt(speed_sqrd); - }); // end parallel for - speed.update_host(); - Kokkos::fence(); - - // save the output scale fields to a single 2D array - - // export material centeric data to the elements - for (int mat_id = 0; mat_id < num_mats; mat_id++) { - size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); - - for (size_t mat_elem_sid = 0; mat_elem_sid < num_mat_elems; mat_elem_sid++) { - // 1 material per element - - // get elem gid - size_t elem_gid = State.MaterialToMeshMaps.elem_in_mat_elem.host(mat_id, mat_elem_sid); - - // save outputs - elem_fields(elem_gid, 0) = State.MaterialPoints.den.host(mat_id, mat_elem_sid); - elem_fields(elem_gid, 1) = State.MaterialPoints.pres.host(mat_id, mat_elem_sid); - elem_fields(elem_gid, 2) = State.MaterialPoints.sie.host(mat_id, mat_elem_sid); - // 3 is guass point vol - elem_fields(elem_gid, 4) = State.MaterialPoints.mass.host(mat_id, mat_elem_sid); - elem_fields(elem_gid, 5) = State.MaterialPoints.sspd.host(mat_id, mat_elem_sid); - // 6 is elem speed - elem_fields(elem_gid, 7) = (double)mat_id; - // 8 is the e_switch - elem_fields(elem_gid, 9) = (double)State.MaterialPoints.eroded.host(mat_id, mat_elem_sid); - } // end for mat elems storage - } // end parallel loop over materials - - // export element centric data - double e_switch = 1; - for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { - elem_fields(elem_gid, 3) = State.GaussPoints.vol.host(elem_gid); - elem_fields(elem_gid, 6) = speed.host(elem_gid); - elem_fields(elem_gid, 8) = e_switch; - elem_switch *= -1; - } // end for elem_gid - - // save the vertex vector fields to an array for exporting to graphics files - CArray vec_fields(num_nodes, num_vec_vars, 3); - - for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { - // position, var 0 - vec_fields(node_gid, 0, 
0) = State.node.coords.host(node_gid, 0); - vec_fields(node_gid, 0, 1) = State.node.coords.host(node_gid, 1); - if (num_dims == 2) { - vec_fields(node_gid, 0, 2) = 0.0; - } - else{ - vec_fields(node_gid, 0, 2) = State.node.coords.host(node_gid, 2); - } - - // velocity, var 1 - vec_fields(node_gid, 1, 0) = State.node.vel.host(node_gid, 0); - vec_fields(node_gid, 1, 1) = State.node.vel.host(node_gid, 1); - if (num_dims == 2) { - vec_fields(node_gid, 1, 2) = 0.0; - } - else{ - vec_fields(node_gid, 1, 2) = State.node.vel.host(node_gid, 2); - } - - // accelleration, var 2 - vec_fields(node_gid, 2, 0) = (State.node.vel.host(node_gid, 0) - State.node.vel_n0.host(node_gid, 0))/dt; - vec_fields(node_gid, 2, 1) = (State.node.vel.host(node_gid, 1) - State.node.vel_n0.host(node_gid, 1))/dt; - if (num_dims == 2) { - vec_fields(node_gid, 2, 2) = 0.0; - } - else{ - vec_fields(node_gid, 2, 2) = (State.node.vel.host(node_gid, 2) - State.node.vel_n0.host(node_gid, 2))/dt; - } - - - } // end for loop over vertices - - - // --------------------------------------------------------------------------- - // Setup of file and directoring for exporting - // --------------------------------------------------------------------------- - FILE* out[20]; // the output files that are written to - char filename[128]; - int max_len = sizeof filename; - int str_output_len; - - struct stat st; - - if (stat("ensight", &st) != 0) { - system("mkdir ensight"); - } - - if (stat("ensight/data", &st) != 0) { - system("mkdir ensight/data"); - } - - // --------------------------------------------------------------------------- - // Write the Geometry file - // --------------------------------------------------------------------------- - // sprintf(filename, "ensight/data/%s.%05d.geo", name, graphics_id); - str_output_len = snprintf(filename, max_len, "ensight/data/%s.%05d.geo", name, graphics_id); - // filename has the full string - if (str_output_len >= max_len) { fputs("Filename length exceeded; string 
truncated", stderr); } - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "A graphics dump by Fierro \n"); - - fprintf(out[0], "%s", "EnSight Gold geometry\n"); - fprintf(out[0], "%s", "node id assign\n"); - fprintf(out[0], "%s", "element id assign\n"); - - fprintf(out[0], "part\n"); - fprintf(out[0], "%10d\n", 1); - fprintf(out[0], "Mesh\n"); - - // --- vertices --- - fprintf(out[0], "coordinates\n"); - fprintf(out[0], "%10lu\n", num_nodes); - - // write all components of the point coordinates - for (int node_gid = 0; node_gid < num_nodes; node_gid++) { - fprintf(out[0], "%12.5e\n", State.node.coords.host(node_gid, 0)); - } - - for (int node_gid = 0; node_gid < num_nodes; node_gid++) { - fprintf(out[0], "%12.5e\n", State.node.coords.host(node_gid, 1)); - } - - for (int node_gid = 0; node_gid < num_nodes; node_gid++) { - if (num_dims == 3) { - fprintf(out[0], "%12.5e\n", State.node.coords.host(node_gid, 2)); - } - else{ - fprintf(out[0], "%12.5e\n", 0.0); - } - } - - // --- elements --- - if (num_dims == 3) { - fprintf(out[0], "hexa8\n"); - } - else{ - fprintf(out[0], "quad4\n"); - } - fprintf(out[0], "%10lu\n", num_elems); - - - int convert_ijk_to_ensight[8]; - if(mesh.num_dims==3){ - convert_ijk_to_ensight[0] = 0; - convert_ijk_to_ensight[1] = 1; - convert_ijk_to_ensight[2] = 3; - convert_ijk_to_ensight[3] = 2; - convert_ijk_to_ensight[4] = 4; - convert_ijk_to_ensight[5] = 5; - convert_ijk_to_ensight[6] = 7; - convert_ijk_to_ensight[7] = 6; - } - else{ - - convert_ijk_to_ensight[0] = 0; - convert_ijk_to_ensight[1] = 1; - convert_ijk_to_ensight[2] = 2; - convert_ijk_to_ensight[3] = 3; - convert_ijk_to_ensight[4] = 4; - convert_ijk_to_ensight[5] = 5; - convert_ijk_to_ensight[6] = 6; - convert_ijk_to_ensight[7] = 7; - } // end if - - - // write all global point numbers for this cell - for (int elem_gid = 0; elem_gid < num_elems; elem_gid++) { - for (int node_lid = 0; node_lid < mesh.num_nodes_in_elem; node_lid++) { - fprintf(out[0], "%10lu\t", 
mesh.nodes_in_elem.host(elem_gid, convert_ijk_to_ensight[node_lid]) + 1); // note: node_gid starts at 1 - } - fprintf(out[0], "\n"); - } - - fclose(out[0]); - - // --------------------------------------------------------------------------- - // Write the Scalar variable files - // --------------------------------------------------------------------------- - - // ensight_vars = (den, pres,...) - for (int var = 0; var < num_scalar_vars; var++) { - // write a scalar value - // sprintf(filename, "ensight/data/%s.%05d.%s", name, graphics_id, scalar_var_names[var]); - str_output_len = snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, scalar_var_names[var]); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "Per_elem scalar values\n"); - fprintf(out[0], "part\n"); - fprintf(out[0], "%10d\n", 1); - if (num_dims == 3) { - fprintf(out[0], "hexa8\n"); - } - else{ - fprintf(out[0], "quad4\n"); - } - - for (int elem_id = 0; elem_id < num_elems; elem_id++) { - fprintf(out[0], "%12.5e\n", elem_fields(elem_id, var)); - } - - fclose(out[0]); - } // end for var - - // --------------------------------------------------------------------------- - // Write the Vector variable files - // --------------------------------------------------------------------------- - - // ensight vector vars = (position, velocity, force) - for (int var = 0; var < num_vec_vars; var++) { - // sprintf(filename, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); - str_output_len = snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - - out[0] = fopen(filename, "w"); - // fprintf(out[0],"Per_node vector values\n"); - // fprintf(out[0],"part\n"); - // fprintf(out[0],"%10d \n",1); - // fprintf(out[0],"hexa8\n"); // WARNING, maybe bug 
here? - - fprintf(out[0], "Per_node vector values\n"); - fprintf(out[0], "part\n"); - fprintf(out[0], "%10d\n", 1); - fprintf(out[0], "block\n"); - - for (int node_gid = 0; node_gid < num_nodes; node_gid++) { - fprintf(out[0], "%12.5e\n", vec_fields(node_gid, var, 0)); - } - - for (int node_gid = 0; node_gid < num_nodes; node_gid++) { - fprintf(out[0], "%12.5e\n", vec_fields(node_gid, var, 1)); - } - - for (int node_gid = 0; node_gid < num_nodes; node_gid++) { - fprintf(out[0], "%12.5e\n", vec_fields(node_gid, var, 2)); - } - - fclose(out[0]); - } // end for var - - // --------------------------------------------------------------------------- - // Write the case file - // --------------------------------------------------------------------------- - - // sprintf(filename, "ensight/%s.case", name); - str_output_len = snprintf(filename, max_len, "ensight/%s.case", name); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "FORMAT\n"); - fprintf(out[0], "type: ensight gold\n"); - fprintf(out[0], "GEOMETRY\n"); - - // sprintf(filename, "model: data/%s.*****.geo\n", name); - str_output_len = snprintf(filename, max_len, "model: data/%s.*****.geo\n", name); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - - fprintf(out[0], "%s", filename); - fprintf(out[0], "VARIABLE\n"); - - for (int var = 0; var < num_scalar_vars; var++) { - // sprintf(filename, "scalar per element: %s data/%s.*****.%s\n", scalar_var_names[var], name, scalar_var_names[var]); - str_output_len = snprintf(filename, max_len, "scalar per element: %s data/%s.*****.%s\n", scalar_var_names[var], name, scalar_var_names[var]); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - - fprintf(out[0], "%s", filename); - } - - for (int var = 0; var < num_vec_vars; var++) { - // sprintf(filename, "vector per node: %s 
data/%s.*****.%s\n", vec_var_names[var], name, vec_var_names[var]); - str_output_len = snprintf(filename, max_len, "vector per node: %s data/%s.*****.%s\n", vec_var_names[var], name, vec_var_names[var]); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - fprintf(out[0], "%s", filename); - } - - fprintf(out[0], "TIME\n"); - fprintf(out[0], "time set: 1\n"); - fprintf(out[0], "number of steps: %4d\n", graphics_id + 1); - fprintf(out[0], "filename start number: 0\n"); - fprintf(out[0], "filename increment: 1\n"); - fprintf(out[0], "time values: \n"); - - graphics_times(graphics_id) = time_value; - - for (int i = 0; i <= graphics_id; i++) { - fprintf(out[0], "%12.5e\n", graphics_times(i)); - } - fclose(out[0]); - - // --------------------------------------------------------------------------- - // Done writing the graphics dump - // --------------------------------------------------------------------------- - - // increment graphics id counter - graphics_id++; - - delete[] name; - - - return; - } - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn write_vtk_old - /// - /// \brief Writes a vtk output file - /// - /// \param Simulation mesh - /// \param State data - /// \param Simulation parameters - /// \param current time value - /// \param Vector of all graphics output times - /// - ///////////////////////////////////////////////////////////////////////////// - void write_vtk_old(Mesh_t& mesh, - State_t& State, - SimulationParameters_t& SimulationParamaters, - double dt, - double time_value, - CArray graphics_times, - std::vector node_states, - std::vector gauss_pt_states, - std::vector material_pt_states) - { - - size_t num_mats = State.MaterialPoints.num_material_points.size(); - - // ---- Update host data ---- - - // material point values - State.MaterialPoints.den.update_host(); - State.MaterialPoints.pres.update_host(); - State.MaterialPoints.stress.update_host(); - 
State.MaterialPoints.sspd.update_host(); - State.MaterialPoints.sie.update_host(); - State.MaterialPoints.mass.update_host(); - State.MaterialPoints.conductivity.update_host(); - State.MaterialPoints.temp_grad.update_host(); - State.MaterialPoints.eroded.update_host(); - - - // gauss point values - State.GaussPoints.vol.update_host(); - - // nodal values - State.node.coords.update_host(); - State.node.vel.update_host(); - State.node.mass.update_host(); - State.node.temp.update_host(); - - Kokkos::fence(); - - - const int num_cell_scalar_vars = 13; - const int num_cell_vec_vars = 0; - const int num_cell_tensor_vars = 0; - - const int num_point_scalar_vars = 1; - const int num_point_vec_vars = 2; - - - // Scalar values associated with a cell - const char cell_scalar_var_names[num_cell_scalar_vars][15] = { - "den", "pres", "sie", "vol", "mass", "sspd", "speed", "mat_id", "elem_switch","eroded", "temp_grad_x", "temp_grad_y", "temp_grad_z" - }; - - const char cell_vec_var_names[num_cell_vec_vars][15] = { - - }; - - const char point_scalar_var_names[num_point_scalar_vars][15] = { - "temp" - }; - - const char point_vec_var_names[num_point_vec_vars][15] = { - "pos", "vel" - }; - - // short hand - const size_t num_nodes = mesh.num_nodes; - const size_t num_elems = mesh.num_elems; - const size_t num_dims = mesh.num_dims; - - // save the cell state to an array for exporting to graphics files - auto elem_fields = CArray(num_elems, num_cell_scalar_vars); - int elem_switch = 1; - - DCArrayKokkos speed(num_elems, "speed"); - FOR_ALL(elem_gid, 0, num_elems, { - double elem_vel[3]; // note:initialization with a list won't work - elem_vel[0] = 0.0; - elem_vel[1] = 0.0; - elem_vel[2] = 0.0; - // get the coordinates of the element center - for (int node_lid = 0; node_lid < mesh.num_nodes_in_elem; node_lid++) { - elem_vel[0] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 0); - elem_vel[1] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 1); - if (mesh.num_dims == 3) 
{ - elem_vel[2] += State.node.vel(mesh.nodes_in_elem(elem_gid, node_lid), 2); - } - else{ - elem_vel[2] = 0.0; - } - } // end loop over nodes in element - elem_vel[0] = elem_vel[0] / mesh.num_nodes_in_elem; - elem_vel[1] = elem_vel[1] / mesh.num_nodes_in_elem; - elem_vel[2] = elem_vel[2] / mesh.num_nodes_in_elem; - - double speed_sqrd = 0.0; - for (int dim = 0; dim < num_dims; dim++) { - speed_sqrd += elem_vel[dim] * elem_vel[dim]; - } - speed(elem_gid) = sqrt(speed_sqrd); - }); // end parallel for - speed.update_host(); - Kokkos::fence(); - - // save the output scale fields to a single 2D array - - - // export material centeric data to the elements - for (int mat_id = 0; mat_id < num_mats; mat_id++) { - size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); - - for (size_t mat_elem_sid = 0; mat_elem_sid < num_mat_elems; mat_elem_sid++) { - // 1 material per element - - // get elem gid - size_t elem_gid = State.MaterialToMeshMaps.elem_in_mat_elem.host(mat_id, mat_elem_sid); - - // save outputs - elem_fields(elem_gid, 0) = State.MaterialPoints.den.host(mat_id,mat_elem_sid); - elem_fields(elem_gid, 1) = State.MaterialPoints.pres.host(mat_id, mat_elem_sid); - elem_fields(elem_gid, 2) = State.MaterialPoints.sie.host(mat_id, mat_elem_sid); - // 3 is guass point vol - elem_fields(elem_gid, 4) = State.MaterialPoints.mass.host(mat_id, mat_elem_sid); - elem_fields(elem_gid, 5) = State.MaterialPoints.sspd.host(mat_id, mat_elem_sid); - // 6 is elem speed - elem_fields(elem_gid, 7) = (double)mat_id; - // 8 is the e_switch - elem_fields(elem_gid, 9) = (double)State.MaterialPoints.eroded.host(mat_id, mat_elem_sid); - elem_fields(elem_gid, 10) = (double)State.MaterialPoints.temp_grad.host(mat_id, elem_gid,0); - elem_fields(elem_gid, 11) = (double)State.MaterialPoints.temp_grad.host(mat_id, elem_gid,1); - elem_fields(elem_gid, 12) = (double)State.MaterialPoints.temp_grad.host(mat_id, elem_gid,2); - } // end for mat elems storage - } // end parallel loop over 
materials - - // export element centric data - double e_switch = 1; - for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { - elem_fields(elem_gid, 3) = State.GaussPoints.vol.host(elem_gid); - elem_fields(elem_gid, 6) = speed.host(elem_gid); - elem_fields(elem_gid, 8) = State.GaussPoints.div.host(elem_gid); - elem_switch *= -1; - } // end for elem_gid - - // save the vertex vector fields to an array for exporting to graphics files - CArray vec_fields(num_nodes, num_point_vec_vars, 3); - CArray point_scalar_fields(num_nodes, num_point_scalar_vars); - - for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { - // position, var 0 - vec_fields(node_gid, 0, 0) = State.node.coords.host(node_gid, 0); - vec_fields(node_gid, 0, 1) = State.node.coords.host(node_gid, 1); - if (num_dims == 2) { - vec_fields(node_gid, 0, 2) = 0.0; - } - else{ - vec_fields(node_gid, 0, 2) = State.node.coords.host(node_gid, 2); - } - - // position, var 1 - vec_fields(node_gid, 1, 0) = State.node.vel.host(node_gid, 0); - vec_fields(node_gid, 1, 1) = State.node.vel.host(node_gid, 1); - if (num_dims == 2) { - vec_fields(node_gid, 1, 2) = 0.0; - } - else{ - vec_fields(node_gid, 1, 2) = State.node.vel.host(node_gid, 2); - } - - point_scalar_fields(node_gid, 0) = State.node.temp.host(node_gid); - } // end for loop over vertices - - - FILE* out[20]; // the output files that are written to - char filename[100]; // char string - int max_len = sizeof filename; - int str_output_len; - - struct stat st; - - if (stat("vtk", &st) != 0) { - system("mkdir vtk"); - } - - // snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); - - //sprintf(filename, "vtk/Fierro.%05d.vtk", graphics_id); // mesh file - str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d.vtk", graphics_id); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - // mesh file - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "# vtk 
DataFile Version 2.0\n"); // part 2 - fprintf(out[0], "Mesh for Fierro\n"); // part 2 - fprintf(out[0], "ASCII \n"); // part 3 - fprintf(out[0], "DATASET UNSTRUCTURED_GRID\n\n"); // part 4 - - fprintf(out[0], "POINTS %zu float\n", mesh.num_nodes); - - // write all components of the point coordinates - for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { - fprintf(out[0], - "%f %f %f\n", - State.node.coords.host(node_gid, 0), - State.node.coords.host(node_gid, 1), - State.node.coords.host(node_gid, 2)); - } // end for - - /* - --------------------------------------------------------------------------- - Write the elems - --------------------------------------------------------------------------- - */ - - fprintf(out[0], "\n"); - fprintf(out[0], "CELLS %lu %lu\n", mesh.num_elems, mesh.num_elems + mesh.num_elems * mesh.num_nodes_in_elem); // size=all printed values - - int Pn_order = mesh.Pn; - int order[3] = { Pn_order, Pn_order, Pn_order }; - - // const int num_1D_points = Pn_order+1; - - // write all global point numbers for this elem - for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { - fprintf(out[0], "%lu ", mesh.num_nodes_in_elem); // num points in this elem - - for (int k = 0; k <= Pn_order; k++) { - for (int j = 0; j <= Pn_order; j++) { - for (int i = 0; i <= Pn_order; i++) { - size_t node_lid = PointIndexFromIJK(i, j, k, order); - fprintf(out[0], "%lu ", mesh.nodes_in_elem.host(elem_gid, node_lid)); - } - } - } - - fprintf(out[0], "\n"); - } // end for - - // Write the element types - fprintf(out[0], "\n"); - fprintf(out[0], "CELL_TYPES %zu \n", mesh.num_elems); - // VTK_LAGRANGE_HEXAHEDRON: 72, - // VTK_HIGHER_ORDER_HEXAHEDRON: 67 - // VTK_BIQUADRATIC_QUADRATIC_HEXAHEDRON = 33 - // element types: https://vtk.org/doc/nightly/html/vtkCellType_8h_source.html - // element types: https://kitware.github.io/vtk-js/api/Common_DataModel_CellTypes.html - // vtk format: 
https://www.kitware.com//modeling-arbitrary-order-lagrange-finite-elements-in-the-visualization-toolkit/ - for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { - fprintf(out[0], "%d \n", 72); - } - - /* - --------------------------------------------------------------------------- - Write the nodal vector variables to file - --------------------------------------------------------------------------- - */ - - fprintf(out[0], "\n"); - fprintf(out[0], "POINT_DATA %zu \n", mesh.num_nodes); - - // vtk vector vars = (position, velocity) - for (int var = 0; var < num_point_vec_vars; var++) { - fprintf(out[0], "VECTORS %s float \n", point_vec_var_names[var]); - for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { - fprintf(out[0], "%f %f %f\n", - vec_fields(node_gid, var, 0), - vec_fields(node_gid, var, 1), - vec_fields(node_gid, var, 2)); - } // end for nodes - } // end for vec_vars - - - // vtk scalar vars = (temp) - for (int var = 0; var < num_point_scalar_vars; var++) { - fprintf(out[0], "SCALARS %s float 1\n", point_scalar_var_names[var]); - fprintf(out[0], "LOOKUP_TABLE default\n"); - for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { - fprintf(out[0], "%f\n", - point_scalar_fields(node_gid, 0)); - } // end for nodes - } // end for vec_vars - - /* - --------------------------------------------------------------------------- - Write the scalar elem variable to file - --------------------------------------------------------------------------- - */ - fprintf(out[0], "\n"); - fprintf(out[0], "CELL_DATA %zu \n", mesh.num_elems); - - for (int var = 0; var < num_cell_scalar_vars; var++) { - fprintf(out[0], "SCALARS %s float 1\n", cell_scalar_var_names[var]); // the 1 is number of scalar components [1:4] - fprintf(out[0], "LOOKUP_TABLE default\n"); - for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { - fprintf(out[0], "%f\n", elem_fields(elem_gid, var)); - } // end for elem - } // end for cell scalar_vars - - 
fclose(out[0]); - - graphics_times(graphics_id) = time_value; - - // Write time series metadata - //sprintf(filename, "vtk/Fierro.vtk.series", graphics_id); // mesh file - str_output_len = snprintf(filename, max_len, "vtk/Fierro.vtk.series"); - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - // mesh file - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "{\n"); - fprintf(out[0], " \"file-series-version\" : \"1.0\",\n"); - fprintf(out[0], " \"files\" : [\n"); - - for (int i = 0; i <= graphics_id; i++) { - fprintf(out[0], " { \"name\" : \"Fierro.%05d.vtk\", \"time\" : %12.5e },\n", i, graphics_times(i) ); - } - - // fprintf(out[0], "%12.5e\n", graphics_times(i)); - fprintf(out[0], " ]\n"); // part 4 - fprintf(out[0], "}"); // part 4 - - fclose(out[0]); - - // increment graphics id counter - graphics_id++; - - - } // end write vtk old - - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn concatenate_elem_fields - /// - /// \brief A function to calculate the average of elem fields and concatentate into 1 array - /// - /// - /// \param MaterialPoints a struct containing the material point state arrays - /// \param elem_scalar_fields the scalar fields - /// \param elem_tensor_fields the tensor fields - /// \param elem_in_mat_elem a listing of the element ids the material resides in - /// \param output_elem_state a std::vector of enums specifying the elem avg outputs - /// \param num_mat_elems the number of elements the material resides in - /// \param mat_id the index for the material - /// - ///////////////////////////////////////////////////////////////////////////// - void concatenate_elem_fields(const MaterialPoint_t& MaterialPoints, - const GaussPoint_t& GaussPoints, - DCArrayKokkos& elem_scalar_fields, - DCArrayKokkos& elem_tensor_fields, - const DRaggedRightArrayKokkos& elem_in_mat_elem, - const std::vector& output_elem_state, - const std::vector& 
output_gauss_pt_states, - const size_t num_mat_elems, - const size_t mat_id, - const size_t num_elems, - const int den_id, - const int pres_id, - const int sie_id, - const int sspd_id, - const int mass_id, - const int stress_id, - const int vol_id, - const int div_id, - const int level_set_id, - const int vel_grad_id, - const int conductivity_id, - const int specific_heat_id) - { - - // --- loop over the material point states - - for (auto field : output_elem_state){ - switch(field){ - // scalar vars - case material_pt_state::density: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - elem_scalar_fields(den_id, elem_gid) += MaterialPoints.den(mat_id, mat_elem_sid)* - MaterialPoints.volfrac(mat_id, mat_elem_sid)* - MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::pressure: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - elem_scalar_fields(pres_id, elem_gid) += MaterialPoints.pres(mat_id, mat_elem_sid)* - MaterialPoints.volfrac(mat_id, mat_elem_sid)* - MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::specific_internal_energy: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - // extensive ie here, but after this function, it will become specific ie - elem_scalar_fields(sie_id, elem_gid) += MaterialPoints.mass(mat_id, mat_elem_sid)* - MaterialPoints.sie(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::sound_speed: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - elem_scalar_fields(sspd_id, elem_gid) += MaterialPoints.sspd(mat_id, mat_elem_sid)* - MaterialPoints.volfrac(mat_id, mat_elem_sid)* - 
MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::mass: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - elem_scalar_fields(mass_id, elem_gid) += MaterialPoints.mass(mat_id, mat_elem_sid); - }); - break; - // --------------- - // tensor vars - // --------------- - case material_pt_state::stress: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - // average tensor fields, it is always 3D - // note: paraview is row-major, CArray convention - for (size_t i=0; i<3; i++){ - for(size_t j=0; j<3; j++){ - - // stress tensor - elem_tensor_fields(stress_id, elem_gid, i, j) += - MaterialPoints.stress(mat_id, mat_elem_sid,i,j) * - MaterialPoints.volfrac(mat_id, mat_elem_sid)* - MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); - } // end for - } // end for - }); - break; - - // thermal solver vars - case material_pt_state::thermal_conductivity: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - elem_scalar_fields(conductivity_id, elem_gid) += MaterialPoints.conductivity(mat_id, mat_elem_sid)* - MaterialPoints.volfrac(mat_id, mat_elem_sid)* - MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); - }); - break; - - case material_pt_state::specific_heat: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - elem_scalar_fields(specific_heat_id, elem_gid) += MaterialPoints.specific_heat(mat_id, mat_elem_sid)* - MaterialPoints.volfrac(mat_id, mat_elem_sid)* - MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); - }); - break; - - - // add other variables here - - // not used variables - case material_pt_state::volume_fraction: - break; - case material_pt_state::eroded_flag: - break; - case 
material_pt_state::elastic_modulii: - break; - case material_pt_state::shear_modulii: - break; - case material_pt_state::poisson_ratios: - break; - case material_pt_state::heat_flux: - break; - } // end switch - }// end for over mat point state - - - // --- add loop over gauss points --- - - // export element centric data - for (auto field : output_gauss_pt_states){ - switch(field){ - // scalars - case gauss_pt_state::volume: - - FOR_ALL(elem_gid, 0, num_elems, { - elem_scalar_fields(vol_id, elem_gid) = GaussPoints.vol(elem_gid); - }); - - break; - case gauss_pt_state::divergence_velocity: - - FOR_ALL(elem_gid, 0, num_elems, { - elem_scalar_fields(div_id, elem_gid) = GaussPoints.div(elem_gid); - }); - - break; - - case gauss_pt_state::level_set: - - FOR_ALL(elem_gid, 0, num_elems, { - elem_scalar_fields(level_set_id, elem_gid) = GaussPoints.level_set(elem_gid); - }); - - break; - - // tensors - case gauss_pt_state::gradient_velocity: - // note: paraview is row-major, CArray convention - FOR_ALL(elem_gid, 0, num_elems, { - for (size_t i=0; i<3; i++){ - for(size_t j=0; j<3; j++){ - elem_tensor_fields(vel_grad_id, elem_gid, i, j) = - GaussPoints.vel_grad(elem_gid, i, j); - } - } // end for - }); - - break; - - // add other gauss variables here - - } // end switch - } // end loop over gauss_pt_states - - - // --- add end gauss point loop -- - - } // end of function - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn concatenate_mat_fields - /// - /// \brief A function to concatentate material fields into 1 array - /// - /// - /// \param MaterialPoints a struct containing the material point state arrays - /// \param elem_scalar_fields the scalar fields - /// \param elem_tensor_fields the tensor fields - /// \param elem_in_mat_elem a listing of the element ids the material resides in - /// \param output_material_pt_states a std::vector of enums specifying the model - /// \param num_mat_elems the number of elements the 
material resides in - /// \param mat_id the index for the material - /// - ///////////////////////////////////////////////////////////////////////////// - void concatenate_mat_fields(const MaterialPoint_t& MaterialPoints, - DCArrayKokkos& mat_elem_scalar_fields, - DCArrayKokkos& mat_elem_tensor_fields, - const DRaggedRightArrayKokkos& elem_in_mat_elem, - const std::vector& output_material_pt_states, - const size_t num_mat_elems, - const size_t mat_id, - const int mat_den_id, - const int mat_pres_id, - const int mat_sie_id, - const int mat_sspd_id, - const int mat_mass_id, - const int mat_volfrac_id, - const int mat_geo_volfrac_id, - const int mat_eroded_id, - const int mat_stress_id, - const int mat_conductivity_id, - const int mat_specific_heat_id) - { - - // --- loop over the material point states - - for (auto field : output_material_pt_states){ - switch(field){ - // scalar vars - case material_pt_state::density: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - mat_elem_scalar_fields(mat_den_id, mat_elem_sid) = MaterialPoints.den(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::pressure: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - mat_elem_scalar_fields(mat_pres_id, mat_elem_sid) = MaterialPoints.pres(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::specific_internal_energy: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - // extensive ie here, but after this function, it will become specific ie - mat_elem_scalar_fields(mat_sie_id, mat_elem_sid) = MaterialPoints.sie(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::sound_speed: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - mat_elem_scalar_fields(mat_sspd_id, mat_elem_sid) = MaterialPoints.sspd(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::mass: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - mat_elem_scalar_fields(mat_mass_id, mat_elem_sid) = MaterialPoints.mass(mat_id, mat_elem_sid); - }); - 
break; - case material_pt_state::volume_fraction: - // material volume fraction - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - // this is the volume fraction of a material within a part - mat_elem_scalar_fields(mat_volfrac_id, mat_elem_sid) = MaterialPoints.volfrac(mat_id, mat_elem_sid); - }); - - // geometric volume fraction - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - // this is the geometric volume fraction (interface reconstruction) - mat_elem_scalar_fields(mat_geo_volfrac_id, mat_elem_sid) = MaterialPoints.geo_volfrac(mat_id, mat_elem_sid); - }); - break; - case material_pt_state::eroded_flag: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - mat_elem_scalar_fields(mat_eroded_id, mat_elem_sid) = (double)MaterialPoints.eroded(mat_id, mat_elem_sid); - }); - break; - // --------------- - // tensor vars - // --------------- - case material_pt_state::stress: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // field - // average tensor fields, it is always 3D - // note: paraview is row-major, CArray convention - for (size_t i=0; i<3; i++){ - for(size_t j=0; j<3; j++){ - - // stress tensor - mat_elem_tensor_fields(mat_stress_id, mat_elem_sid, i, j) = - MaterialPoints.stress(mat_id, mat_elem_sid,i,j); - } // end for - } // end for - }); - break; - - // thermal solver vars - case material_pt_state::thermal_conductivity: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - mat_elem_scalar_fields(mat_conductivity_id, elem_gid) += MaterialPoints.conductivity(mat_id, mat_elem_sid); - }); - break; - - case material_pt_state::specific_heat: - FOR_ALL(mat_elem_sid, 0, num_mat_elems, { - - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // field - mat_elem_scalar_fields(mat_specific_heat_id, elem_gid) += MaterialPoints.specific_heat(mat_id, mat_elem_sid); - }); - break; - - // add other variables here - - // not used variables 
- case material_pt_state::elastic_modulii: - break; - case material_pt_state::shear_modulii: - break; - case material_pt_state::poisson_ratios: - break; - case material_pt_state::heat_flux: - break; - } // end switch - }// end for over mat point state - - - } // end of function - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn concatenate_nodal_fields - /// - /// \brief A function to calculate the average of elem fields - /// - /// - /// \param Node a struct containing the material point state arrays - /// \param elem_scalar_fields the scalar fields - /// \param elem_tensor_fields the tensor fields - /// \param elem_in_mat_elem a listing of the element ids the material resides in - /// \param output_node_states a std::vector of enums specifying the model - /// \param num_mat_elems the number of elements the material resides in - /// \param mat_id the index for the material - /// - ///////////////////////////////////////////////////////////////////////////// - void concatenate_nodal_fields(const node_t& Node, - DCArrayKokkos& node_scalar_fields, - DCArrayKokkos& node_vector_fields, - std::vector& output_node_states, - double dt, - const size_t num_nodes, - const size_t num_dims, - const int node_mass_id, - const int node_vel_id, - const int node_accel_id, - const int node_coord_id, - const int node_grad_level_set_id, - const int node_temp_id) - { - for (auto field : output_node_states){ - switch(field){ - // scalars - case node_state::mass: - - FOR_ALL(node_gid, 0, num_nodes, { - node_scalar_fields(node_mass_id, node_gid) = Node.mass(node_gid); - }); - - break; - case node_state::temp: - FOR_ALL(node_gid, 0, num_nodes, { - node_scalar_fields(node_temp_id, node_gid) = Node.temp(node_gid); - }); - - break; - - // vector fields - - case node_state::coords: - - FOR_ALL(node_gid, 0, num_nodes, { - - node_vector_fields(node_coord_id, node_gid, 0) = Node.coords(node_gid, 0); - node_vector_fields(node_coord_id, node_gid, 1) = 
Node.coords(node_gid, 1); - if (num_dims == 2) { - node_vector_fields(node_coord_id, node_gid, 2) = 0.0; - } - else{ - node_vector_fields(node_coord_id, node_coord_id, 2) = Node.coords(node_gid, 2); - } // end if - - }); // end parallel for - - break; - case node_state::velocity: - - FOR_ALL(node_gid, 0, num_nodes, { - - // velocity, var is node_vel_id - node_vector_fields(node_vel_id, node_gid, 0) = Node.vel(node_gid, 0); - node_vector_fields(node_vel_id, node_gid, 1) = Node.vel(node_gid, 1); - if (num_dims == 2) { - node_vector_fields(node_vel_id, node_gid, 2) = 0.0; - } - else{ - node_vector_fields(node_vel_id, node_gid, 2) = Node.vel(node_gid, 2); - } // end if - - // accellerate, var is node_accel_id - node_vector_fields(node_accel_id, node_gid, 0) = (Node.vel(node_gid, 0) - Node.vel_n0(node_gid, 0))/dt; - node_vector_fields(node_accel_id, node_gid, 1) = (Node.vel(node_gid, 1) - Node.vel_n0(node_gid, 1))/dt; - if (num_dims == 2) { - node_vector_fields(node_accel_id, node_gid, 2) = 0.0; - } - else{ - node_vector_fields(node_accel_id, node_gid, 2) = (Node.vel(node_gid, 2) - Node.vel_n0(node_gid, 2))/dt; - } // end if - - }); // end parallel for - - break; - - - case node_state::gradient_level_set: - - FOR_ALL(node_gid, 0, num_nodes, { - - // velocity, var is node_vel_id - node_vector_fields(node_grad_level_set_id, node_gid, 0) = Node.gradient_level_set(node_gid, 0); - node_vector_fields(node_grad_level_set_id, node_gid, 1) = Node.gradient_level_set(node_gid, 1); - if (num_dims == 2) { - node_vector_fields(node_grad_level_set_id, node_gid, 2) = 0.0; - } - else{ - node_vector_fields(node_grad_level_set_id, node_gid, 2) = Node.gradient_level_set(node_gid, 2); - } // end if - - }); // end parallel for - - break; - - // -- not used vars - case node_state::force: - break; - - // heat transer vars - case node_state::heat_transfer: - break; - // tensors - } // end switch - } // end for over - - - - } // end function - - 
///////////////////////////////////////////////////////////////////////////// - /// - /// \fn write_vtu - /// - /// \brief Writes a vtu ASCII output file - /// - /// \param Simulation mesh - /// \param State data - /// \param Simulation parameters - /// \param current time value - /// \param Vector of all graphics output times - /// - ///////////////////////////////////////////////////////////////////////////// - void write_vtu( - const ViewCArray& node_coords_host, - const ViewCArray& nodes_in_elem_host, - const DCArrayKokkos& elem_scalar_fields, - const DCArrayKokkos& elem_tensor_fields, - const DCArrayKokkos& node_scalar_fields, - const DCArrayKokkos& node_vector_fields, - const std::vector& elem_scalar_var_names, - const std::vector& elem_tensor_var_names, - const std::vector& node_scalar_var_names, - const std::vector& node_vector_var_names, - const std::string partname, - const int graphics_id, - const size_t num_nodes, - const size_t num_elems, - const size_t num_nodes_in_elem, - const int Pn_order, - const size_t num_dims, - const size_t solver_id - ) - { - FILE* out[20]; // the output files that are written to - char filename[100]; // char string - int max_len = sizeof filename; - int str_output_len; - - const size_t num_elem_scalar_vars = elem_scalar_var_names.size(); - const size_t num_elem_tensor_vars = elem_tensor_var_names.size(); - - const size_t num_node_scalar_vars = node_scalar_var_names.size(); - const size_t num_node_vector_vars = node_vector_var_names.size(); - - - // create filename - str_output_len = snprintf(filename, max_len, "vtk/data/Fierro.solver%zu.%s.%05d.vtu", - solver_id, partname.c_str(), graphics_id); - - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - // mesh file - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "\n"); - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - fprintf(out[0], " \n", num_nodes, num_elems); - - /* - 
--------------------------------------------------------------------------- - Write the mesh points - --------------------------------------------------------------------------- - */ - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - fprintf(out[0], " \n"); - fprintf(out[0], " \n"); - - // write all components of the point coordinates - for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { - double coord_z = 0.0; - if(num_dims==3){ - coord_z = node_coords_host(node_gid, 2); - } - fprintf(out[0], - " %f %f %f\n", - node_coords_host(node_gid, 0), - node_coords_host(node_gid, 1), - coord_z); - } // end for - fprintf(out[0], " \n"); - fprintf(out[0], " \n"); - - /* - --------------------------------------------------------------------------- - Write the elems - --------------------------------------------------------------------------- - */ - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - fprintf(out[0], " \n"); - fprintf(out[0], " \n"); - - // WARNING: look into high-order Pn 2D elements with paraview - int Pn_order_z = 0; - if (num_dims == 3){ - Pn_order_z = Pn_order; - } - int order[3] = {Pn_order, Pn_order, Pn_order_z}; - - // const int num_1D_points = Pn_order+1; - - // write all global point numbers for this elem - for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { - fprintf(out[0], " "); // adding indentation before printing nodes in element - if (num_dims==3 && Pn_order>1){ - for (int k = 0; k <= Pn_order_z; k++) { - for (int j = 0; j <= Pn_order; j++) { - for (int i = 0; i <= Pn_order; i++) { - size_t node_lid = PointIndexFromIJK(i, j, k, order); - fprintf(out[0], "%lu ", nodes_in_elem_host(elem_gid, node_lid)); - } - } - } // end for - } - else if (num_dims == 3 && Pn_order == 1){ - // 3D linear hexahedral elements - for (int node_lid = 0; node_lid < 8; node_lid++) { - fprintf(out[0], "%lu ", nodes_in_elem_host(elem_gid, node_lid)); - } // end for - } - else if (num_dims == 2){ - // 2D linear is the only supported option - for (int 
node_lid = 0; node_lid < 4; node_lid++) { - fprintf(out[0], "%lu ", nodes_in_elem_host(elem_gid, node_lid)); - } // end for - } - else { - std::cout << "ERROR: outputs failed, dimensions and element types are not compatible \n"; - } // end if - fprintf(out[0], "\n"); - } // end for - fprintf(out[0], " \n"); - - // Write the element offsets - fprintf(out[0], " \n"); - size_t count=0; - for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { - count += num_nodes_in_elem; - fprintf(out[0], " %lu\n", count); // num points in this elem + all others before it - } // end for - fprintf(out[0], " \n"); - - - // Write the element types - fprintf(out[0], " \n"); - // ---- - // linear element types - // VTK_PIXEL = 8, linear 2D quad with i,j,k indexing (future format for 2D solver) - // VTK_Quad = 9, linear 2D quad with ensight index ordering (current 2D rz convention) - // VTK_VOXEL = 11, linear 3D hex with i,j,k indexing (current format) - // arbitrary order types - // VTK_LAGRANGE_QUADRILATERAL = 70, use this type when a 2D high-order scheme exists - // VTK_LAGRANGE_HEXAHEDRON: 72, this is the current 3D high-order - // VTK_HIGHER_ORDER_HEXAHEDRON: 67 - // VTK_BIQUADRATIC_QUADRATIC_HEXAHEDRON = 33 - // element types: https://vtk.org/doc/nightly/html/vtkCellType_8h_source.html - // element types: https://kitware.github.io/vtk-js/api/Common_DataModel_CellTypes.html - // vtk format: https://www.kitware.com//modeling-arbitrary-order-lagrange-finite-elements-in-the-visualization-toolkit/ - for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { - if (num_dims==3 && Pn_order>1){ - fprintf(out[0], " %d \n", 72); - } - else if (num_dims == 3 && Pn_order == 1){ - // 3D linear hex - fprintf(out[0], " %d \n", 11); - } - else { - // 2D ensight mesh ordering - fprintf(out[0], " %d \n", 9); - } - } - fprintf(out[0], " \n"); - fprintf(out[0], " \n"); - - - /* - --------------------------------------------------------------------------- - Write the nodal variables to file - 
--------------------------------------------------------------------------- - */ - // vtk vector vars = (position, velocity) - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - if(num_node_vector_vars >0 || num_node_scalar_vars>0){ - - fprintf(out[0], " \n"); - - // node vectors - for (int a_var = 0; a_var < num_node_vector_vars; a_var++) { - fprintf(out[0], " \n", node_vector_var_names[a_var].c_str()); - - for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { - fprintf(out[0], " %f %f %f\n", - node_vector_fields.host(a_var, node_gid, 0), - node_vector_fields.host(a_var, node_gid, 1), - node_vector_fields.host(a_var, node_gid, 2)); - } // end for nodes - fprintf(out[0], " \n"); - - } // end for vec_vars - - - // node scalar vars - for (int a_var = 0; a_var < num_node_scalar_vars; a_var++) { - fprintf(out[0], " \n", node_scalar_var_names[a_var].c_str()); - for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { - fprintf(out[0], " %f\n", node_scalar_fields.host(a_var, node_gid)); - } // end for nodes - fprintf(out[0], " \n"); - } // end for vec_vars - - fprintf(out[0], " \n"); - - } // end if - - /* - --------------------------------------------------------------------------- - Write the elem variables to file - --------------------------------------------------------------------------- - */ - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - if(num_elem_scalar_vars >0 || num_elem_tensor_vars>0){ - - fprintf(out[0], " \n"); - - for (int a_var = 0; a_var < num_elem_scalar_vars; a_var++) { - - fprintf(out[0], " \n", elem_scalar_var_names[a_var].c_str()); // the 1 is number of scalar components [1:4] - - for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { - fprintf(out[0], " %f\n", elem_scalar_fields.host(a_var, elem_gid)); - } // end for elem - fprintf(out[0], " \n"); - } // end for elem scalar_vars - - - // tensors - for (int a_var = 0; a_var < num_elem_tensor_vars; a_var++) { - fprintf(out[0], " \n", elem_tensor_var_names[a_var].c_str()); 
// the 1 is number of scalar components [1:4] - - for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { - // note: paraview is row-major, CArray convention - // Txx Txy Txz Tyx Tyy Tyz Tzx Tzy Tzz - for (size_t i=0; i<3; i++){ - for(size_t j=0; j<3; j++){ - fprintf(out[0], " %f ", elem_tensor_fields.host(a_var, elem_gid, i, j)); - } // end j - } // end i - } // end for elem - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - } // end for elem scalar_vars - - fprintf(out[0], " \n"); - } // end if - - // end of the vtu file - fprintf(out[0], " \n"); - fprintf(out[0], " \n"); - fprintf(out[0], "\n"); - - //----------------- - // close the vtu file for element fields - //----------------- - fclose(out[0]); - - } // end write vtu - - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn write_pvd - /// - /// \brief Writes a pvd ASCII output file for the element and nodal fields - /// - /// \param Vector of all graphics output times - /// \param element average field names - /// \param current time value - /// \param graphics index - /// - ///////////////////////////////////////////////////////////////////////////// - void write_pvd(CArray& graphics_times, - double time_value, - int graphics_id, - const size_t solver_id){ - - FILE* out[20]; // the output files that are written to - char filename[100]; // char string - int max_len = sizeof filename; - int str_output_len; - - // Write time series metadata - str_output_len = snprintf(filename, max_len, "vtk/Fierro.solver%zu.pvd", solver_id); - - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - // mesh file - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "\n"); - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - - for (int i = 0; i <= graphics_id; i++) { - fprintf(out[0], " \n", - graphics_times(i), solver_id, i, graphics_times(i) ); - //fprintf(out[0], " \n", - // i, solver_id, i, graphics_times(i) ); - } - - 
fprintf(out[0], " \n"); - fprintf(out[0], ""); - - fclose(out[0]); - - } // end pvd - - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn write_vtm - /// - /// \brief Writes a vtm ASCII output file for all fields -- mesh and material - /// - /// \param Vector of all graphics output times - /// \param element average field names - /// \param current time value - /// \param graphics index - /// - ///////////////////////////////////////////////////////////////////////////// - void write_vtm(CArray& graphics_times, - const std::string& elem_part_name, - const std::string& mat_part_name, - double time_value, - int graphics_id, - int num_mats, - bool write_mesh_state, - bool write_mat_pt_state, - const size_t solver_id) - { - // loop over all the files that were written - for(int file_id=0; file_id<=graphics_id; file_id++){ - - FILE* out[20]; // the output files that are written to - char filename[100]; // char string - int max_len = sizeof filename; - int str_output_len; - - - // Write time series metadata to the data file - str_output_len = snprintf(filename, max_len, "vtk/data/Fierro.solver%zu.%05d.vtm", solver_id, file_id); - - if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } - // mesh file - - out[0] = fopen(filename, "w"); - - fprintf(out[0], "\n"); - fprintf(out[0], "\n"); - fprintf(out[0], " \n"); - - - // Average mesh fields -- node and elem state written - size_t block_id = 0; // this will need to be incremented based on the number of mesh fields written - if (write_mesh_state){ - fprintf(out[0], " \n", block_id); - { - block_id++; // increment block id for material outputs that follow the element avg block - - // elem and nodal fields are in this file - fprintf(out[0], " \n"); - fprintf(out[0], " \n", - file_id, solver_id, elem_part_name.c_str(), file_id, graphics_times(file_id) ); - fprintf(out[0], " \n"); - - // add other Mesh average output Pieces here - } - 
fprintf(out[0], " \n"); - } // end if write elem and node state is true - - // note: the block_id was incremented if an element average field output was made - if (write_mat_pt_state){ - fprintf(out[0], " \n", block_id); - for (size_t mat_id=0; mat_id\n", mat_id, mat_id); - fprintf(out[0], " \n", - file_id, solver_id, mat_part_name.c_str(), mat_id, file_id, graphics_times(file_id) ); - fprintf(out[0], " \n"); - - } // end for loop mat_id - fprintf(out[0], " \n"); - } // end if write mat satte is true - - // done writing the files to be read by the vtm file - fprintf(out[0], " \n"); - fprintf(out[0], ""); - - fclose(out[0]); - - } // end for file_id - - } // end vtm - - - ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn build_material_elem_node_lists - /// - /// \brief Creates elems and nodes for a unique mesh of a material (i.e, a part) - /// - /// \param Simulation mesh - /// \param State node data - /// \param Material node coordinates - /// \param Material nodes in the material element - /// \param Material to mesh map for elements - /// \param number of material nodes - /// \param number of material elements - /// \param number of nodes in the element - /// \param number of dimensions - /// - ///////////////////////////////////////////////////////////////////////////// - void build_material_elem_node_lists( - const Mesh_t& mesh, - const DCArrayKokkos& state_node_coords, - DCArrayKokkos& mat_node_coords, - DCArrayKokkos & mat_nodes_in_mat_elem, - const DRaggedRightArrayKokkos& elem_in_mat_elem, - const size_t mat_id, - size_t& num_mat_nodes, - const size_t num_mat_elems, - const size_t num_nodes_in_elem, - const size_t num_dims) - { - - // helper arrays - DCArrayKokkos dummy_counter(mesh.num_nodes, "dummy_counter"); - DCArrayKokkos access_mat_node_gids(mesh.num_nodes, "access_mat_node_gids"); - dummy_counter.set_values(0); - - // tag and count the number of nodes in this part - FOR_ALL (mat_elem_sid, 0, num_mat_elems, 
{ - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); // WARNING not GPU compatible - - // parallel loop over the nodes in the element - for(size_t node_lid=0; node_lid0 - - } // end for nodes in element - - }); // end parallel for - Kokkos::fence(); - dummy_counter.update_host(); - - // loop opperation is not thread safe, must be run serially - size_t mat_node_gid = 0; - for(size_t node_gid = 0; node_gid0){ - mat_node_coords.host(mat_node_gid, 0) = state_node_coords.host(node_gid, 0); - mat_node_coords.host(mat_node_gid, 1) = state_node_coords.host(node_gid, 1); - if (num_dims == 3){ - mat_node_coords.host(mat_node_gid, 2) = state_node_coords.host(node_gid, 2); - } // end if on dims - - access_mat_node_gids.host(node_gid) = mat_node_gid; // the part node id - - mat_node_gid ++; - - dummy_counter.host(node_gid) = 0; // set counter to zero, it was accounted for - } // end if this node is on the part - - } // end loop over all mesh nodes - mat_node_coords.update_device(); - access_mat_node_gids.update_device(); - dummy_counter.update_device(); - Kokkos::fence(); - - // save the number of nodes defining the material region, i.e., the part - num_mat_nodes = mat_node_gid; - - // save the new node id's - FOR_ALL (mat_elem_sid, 0, num_mat_elems, { - // get elem gid - size_t elem_gid = elem_in_mat_elem(mat_id, mat_elem_sid); - - // parallel loop over the nodes in the element - for(size_t node_lid=0; node_lid graphics_times, - std::vector node_states, - std::vector gauss_pt_states, - std::vector material_pt_states) - { - // WARNING WARNING WARNING: - // This currently assumes the gauss and material point IDs are the same as the element ID - // This will need to be updated for high order methods - - // Update host data - // ---- Update host data ---- - size_t num_mats = State.MaterialPoints.num_material_points.size(); - - State.MaterialPoints.den.update_host(); - State.MaterialPoints.pres.update_host(); - State.MaterialPoints.stress.update_host(); - 
State.MaterialPoints.sspd.update_host(); - State.MaterialPoints.sie.update_host(); - State.MaterialPoints.mass.update_host(); - - State.GaussPoints.vol.update_host(); - - State.node.coords.update_host(); - State.node.vel.update_host(); - State.node.mass.update_host(); - - Kokkos::fence(); - - struct stat st; - - if (stat("state", &st) != 0) { - system("mkdir state"); - } - - size_t num_dims = mesh.num_dims; - - // --------------------------------------------------------------------------- - // Setup of file and directory for exporting - // --------------------------------------------------------------------------- - - // output file - FILE* out_elem_state; // element average state - char filename[128]; - - int max_len = sizeof filename; - - snprintf(filename, max_len, "state/mat_pt_state_t_%6.4e.txt", time_value); - - // output files - out_elem_state = fopen(filename, "w"); - - // write state dump - fprintf(out_elem_state, "# state dump file\n"); - fprintf(out_elem_state, "# x y z radius_2D radius_3D den pres sie sspd vol mass \n"); - - // write out values for the elem - for (size_t mat_id = 0; mat_id < num_mats; mat_id++) { - - size_t num_mat_elems = State.MaterialToMeshMaps.num_mat_elems.host(mat_id); - - for (size_t mat_elem_sid = 0; mat_elem_sid < num_mat_elems; mat_elem_sid++) - { - - const size_t elem_gid = State.MaterialToMeshMaps.elem_in_mat_elem.host(mat_id, mat_elem_sid); - - double elem_coords[3]; - elem_coords[0] = 0.0; - elem_coords[1] = 0.0; - elem_coords[2] = 0.0; - - // get the coordinates of the element center - for (size_t node_lid = 0; node_lid < mesh.num_nodes_in_elem; node_lid++) { - - elem_coords[0] += State.node.coords.host(mesh.nodes_in_elem.host(elem_gid, node_lid), 0); - elem_coords[1] += State.node.coords.host(mesh.nodes_in_elem.host(elem_gid, node_lid), 1); - if (num_dims == 3) { - elem_coords[2] += State.node.coords.host(mesh.nodes_in_elem.host(elem_gid, node_lid), 2); - } - else{ - elem_coords[2] = 0.0; - } - } // end loop over nodes 
in element - - elem_coords[0] = elem_coords[0] / ((double)mesh.num_nodes_in_elem); - elem_coords[1] = elem_coords[1] / ((double)mesh.num_nodes_in_elem); - elem_coords[2] = elem_coords[2] / ((double)mesh.num_nodes_in_elem); - - double rad2 = sqrt(elem_coords[0] * elem_coords[0] + - elem_coords[1] * elem_coords[1]); - - double rad3 = sqrt(elem_coords[0] * elem_coords[0] + - elem_coords[1] * elem_coords[1] + - elem_coords[2] * elem_coords[2]); - - - fprintf(out_elem_state, "%4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t \n", - elem_coords[0], - elem_coords[1], - elem_coords[2], - rad2, - rad3, - State.MaterialPoints.den.host(mat_id, mat_elem_sid), - State.MaterialPoints.pres.host(mat_id, mat_elem_sid), - State.MaterialPoints.sie.host(mat_id, mat_elem_sid), - State.MaterialPoints.sspd.host(mat_id, mat_elem_sid), - State.GaussPoints.vol.host(elem_gid), - State.MaterialPoints.mass.host(mat_id, mat_elem_sid) ); - - } // end for elements - - } // end for materials - fclose(out_elem_state); - - - - // printing nodal state - - FILE* out_point_state; // element average state - - snprintf(filename, max_len, "state/node_state_t_%6.4e.txt", time_value); - - // output files - out_point_state = fopen(filename, "w"); - - // write state dump - fprintf(out_point_state, "# state node dump file\n"); - fprintf(out_point_state, "# x y z radius_2D radius_3D vel_x vel_y vel_z speed ||err_v_dot_r|| \n"); - - // get the coordinates of the node - for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { - - double node_coords[3]; - - node_coords[0] = State.node.coords.host(node_gid, 0); - node_coords[1] = State.node.coords.host(node_gid, 1); - if (num_dims == 3) { - node_coords[2] = State.node.coords.host(node_gid, 2); - } - else{ - node_coords[2] = 0.0; - } - - double rad2 = sqrt(node_coords[0] * node_coords[0] + - node_coords[1] * node_coords[1]); - double rad3 = sqrt(node_coords[0] * node_coords[0] + - node_coords[1] * 
node_coords[1] + - node_coords[2] * node_coords[2]); - - double node_vel[3]; - - node_vel[0] = State.node.vel.host(node_gid, 0); - node_vel[1] = State.node.vel.host(node_gid, 1); - if (num_dims == 3) { - node_vel[2] = State.node.vel.host(node_gid, 2); - } - else{ - node_vel[2] = 0.0; - } - - double speed = sqrt(node_vel[0] * node_vel[0] + - node_vel[1] * node_vel[1] + - node_vel[2] * node_vel[2]); - - - - // looking at perfect radial motion - double unit_r_vec[2]; - unit_r_vec[0] = node_coords[0]/rad2; - unit_r_vec[1] = node_coords[1]/rad2; - - //the radial motion - double v_dot_r = node_vel[0] * unit_r_vec[0] + - node_vel[1] * unit_r_vec[1]; - - - double err_v_dot_r[3]; - err_v_dot_r[0] = node_vel[0]-unit_r_vec[0]*v_dot_r; - err_v_dot_r[1] = node_vel[1]-unit_r_vec[1]*v_dot_r; - - double mag_err_v_dot_r = sqrt(err_v_dot_r[0]*err_v_dot_r[0] + err_v_dot_r[1]*err_v_dot_r[1]); - - fprintf(out_point_state, "%4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t %4.12e\t \n", - node_coords[0], - node_coords[1], - node_coords[2], - rad2, - rad3, - node_vel[0], - node_vel[1], - node_vel[2], - speed, - mag_err_v_dot_r); - - - } // end loop over nodes in element - + // update device side + mesh.nodes_in_elem.update_device(); - fclose(out_point_state); - return; - } // end of state output -}; // end class + // Build connectivity + mesh.build_connectivity(); +} // end build_3d_box -#endif // end Header Guard \ No newline at end of file +#endif \ No newline at end of file diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h new file mode 100644 index 00000000..7a1cb676 --- /dev/null +++ b/examples/mesh_decomp/state.h @@ -0,0 +1,139 @@ +/********************************************************************************************** +� 2020. Triad National Security, LLC. All rights reserved. +This program was produced under U.S. 
Government contract 89233218CNA000001 for Los Alamos +National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. +Department of Energy/National Nuclear Security Administration. All rights in the program are +reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear +Security Administration. The Government is granted for itself and others acting on its behalf a +nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare +derivative works, distribute copies to the public, perform publicly and display publicly, and +to permit others to do so. +This program is open source under the BSD-3 License. +Redistribution and use in source and binary forms, with or without modification, are permitted +provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice, this list of +conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or other materials +provided with the distribution. +3. Neither the name of the copyright holder nor the names of its contributors may be used +to endorse or promote products derived from this software without specific prior +written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************************/ +#ifndef STATE_H +#define STATE_H + +#include "matar.h" + +using namespace mtr; + + +// Possible node states, used to initialize node_t +enum class node_state +{ + coords +}; + + +///////////////////////////////////////////////////////////////////////////// +/// +/// \struct node_t +/// +/// \brief Stores state information associated with a node +/// +///////////////////////////////////////////////////////////////////////////// +struct node_t +{ + DCArrayKokkos coords; ///< Nodal coordinates + DCArrayKokkos coords_n0; ///< Nodal coordinates at tn=0 of time integration + + // initialization method (num_nodes, num_dims, state to allocate) + void initialize(size_t num_nodes, size_t num_dims, std::vector node_states) + { + for (auto field : node_states){ + switch(field){ + case node_state::coords: + if (coords.size() == 0) this->coords = DCArrayKokkos(num_nodes, num_dims, "node_coordinates"); + if (coords_n0.size() == 0) this->coords_n0 = DCArrayKokkos(num_nodes, num_dims, "node_coordinates_n0"); + break; + default: + std::cout<<"Desired node state not understood in node_t initialize"< vol; ///< GaussPoint volume + + + // initialization method (num_cells, num_dims) + void initialize(size_t num_gauss_pnts, size_t num_dims, std::vector gauss_pt_states) + { + + for (auto field : gauss_pt_states){ + switch(field){ + case gauss_pt_state::volume: + if (vol.size() == 
0) this->vol = DCArrayKokkos(num_gauss_pnts, "gauss_point_volume"); + break; + default: + std::cout<<"Desired gauss point state not understood in GaussPoint_t initialize"< Date: Wed, 22 Oct 2025 20:08:00 -0500 Subject: [PATCH 03/52] ENH: Adding mesh decomposition example WIP, Initial decomposition nearly done --- examples/mesh_decomp/mesh.h | 7 + examples/mesh_decomp/mesh_decomp.cpp | 338 ++++++++++++++++++++++++++- 2 files changed, 335 insertions(+), 10 deletions(-) diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h index 9a7140a3..6d1e31d7 100644 --- a/examples/mesh_decomp/mesh.h +++ b/examples/mesh_decomp/mesh.h @@ -303,6 +303,13 @@ struct Mesh_t RaggedRightArrayKokkos bdy_nodes_in_set; ///< Boundary nodes in a boundary set DCArrayKokkos num_bdy_nodes_in_set; ///< Number of boundary nodes in a set + + // MPI Decomposition Data Definitions ---- // + DCArrayKokkos local_to_global_node_mapping; ///< Local to global node mapping + + DCArrayKokkos local_to_global_elem_mapping; ///< Local to global element mapping + + // initialization methods void initialize_nodes(const size_t num_nodes_inp) { diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 595ab4e0..1aa70f16 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include "mesh.h" @@ -15,6 +16,33 @@ #include "ptscotch.h" +void calc_elements_per_rank(std::vector& elems_per_rank, int num_elems, int world_size){ + // Compute elements to send to each rank; handle remainders for non-even distribution + std::fill(elems_per_rank.begin(), elems_per_rank.end(), num_elems / world_size); + int remainder = num_elems % world_size; + for (int i = 0; i < remainder; ++i) { + elems_per_rank[i] += 1; + } +} + +void print_mesh_info(Mesh_t& mesh){ + std::cout<<"Mesh has "< elements_on_rank; + std::vector nodes_on_rank; + + + std::vector elems_per_rank(world_size); + std::vector 
nodes_per_rank(world_size); + + // create a 2D vector of elements to send to each rank + std::vector> elements_to_send(world_size); + + // create a 2D vector of nodes to send to each rank + std::vector> nodes_to_send(world_size); if (rank == 0) { - std::cout<<"Rank "< all_elements; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = elems_per_rank[i]; + displs[i] = displacement; + // Copy elements for rank i to the flattened array + for (int j = 0; j < elems_per_rank[i]; j++) { + all_elements.push_back(elements_to_send[i][j]); + } + displacement += elems_per_rank[i]; + } + // Send the elements to each rank + MPI_Scatterv(all_elements.data(), sendcounts.data(), displs.data(), MPI_INT, + elements_on_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + elements_on_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); } + + MPI_Barrier(MPI_COMM_WORLD); + std::cout << "Rank " << rank << " received elements: "; + for (int i = 0; i < num_elements_on_rank; i++) { + std::cout << elements_on_rank[i] << " "; + } + std::cout << std::endl; + if (rank == 0) { + + // Populate the nodes_to_send array by finding all nodes in the elements in elements_to_send and removing duplicates + for (int i = 0; i < world_size; i++) { + std::set nodes_set; + for (int j = 0; j < elems_per_rank[i]; j++) { + for (int k = 0; k < 8; k++) { + nodes_set.insert(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); + } + } + nodes_to_send[i] = std::vector(nodes_set.begin(), nodes_set.end()); + } + for (int i = 0; i < world_size; i++) { + nodes_per_rank[i] = nodes_to_send[i].size(); + } + } + + // Send the number of nodes to each rank using MPI_scatter + MPI_Scatter(nodes_per_rank.data(), 1, MPI_INT, + &num_nodes_on_rank, 1, MPI_INT, + 0, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + std::cout << 
"Rank " << rank << " received " << num_nodes_on_rank << " nodes" << std::endl; + nodes_on_rank.resize(num_nodes_on_rank); + + if (rank == 0) { + + // print the nodes_to_send array + for (int i = 0; i < world_size; i++) { + std::cout< all_nodes; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = nodes_to_send[i].size(); + displs[i] = displacement; + // Copy nodes for rank i to the flattened array + for (int j = 0; j < nodes_to_send[i].size(); j++) { + all_nodes.push_back(nodes_to_send[i][j]); + } + displacement += nodes_to_send[i].size(); + } + // Send the nodes to each rank + // all_nodes.data(): Pointer to the flattened array of all nodes to be sent to each rank + // sendcounts.data(): Array with the number of nodes to send to each rank + // displs.data(): Array with the displacement for each rank in the flattened array + // MPI_INT: Data type of the nodes (integer) + // nodes_on_rank.data(): Pointer to the buffer where each rank will receive its nodes + // num_nodes_on_rank: Number of nodes that the receiving rank expects to receive + // MPI_INT: Data type of the receive buffer (integer) + // 0: The root rank (rank 0) that is performing the scatter + // MPI_COMM_WORLD: The communicator + MPI_Scatterv(all_nodes.data(), sendcounts.data(), displs.data(), MPI_INT, + nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + + } + + MPI_Barrier(MPI_COMM_WORLD); + + std::cout << "Rank " << rank << " received nodes: "; + for (int i = 0; i < num_nodes_on_rank; i++) { + std::cout << nodes_on_rank[i] << " "; + } + std::cout << std::endl; - if (rank == 0) std::cout<<"Finished decomposition"< nodes_in_elem_on_rank; + + // All ranks need to resize their receive buffer + nodes_in_elem_on_rank.resize(num_elements_on_rank 
* 8); + if (rank == 0) { + // Prepare element-node connectivity data for each rank + std::vector all_nodes_in_elem; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for(int i = 0; i < world_size; i++) { + int num_connectivity_entries = elements_to_send[i].size() * 8; // 8 nodes per element + sendcounts[i] = num_connectivity_entries; + displs[i] = displacement; + + // Copy element-node connectivity for rank i + for(int j = 0; j < elements_to_send[i].size(); j++) { + for(int k = 0; k < 8; k++) { + all_nodes_in_elem.push_back(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); + } + } + displacement += num_connectivity_entries; + } + + // Send the connectivity data to each rank + MPI_Scatterv(all_nodes_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, + nodes_in_elem_on_rank.data(), num_elements_on_rank * 8, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + nodes_in_elem_on_rank.data(), num_elements_on_rank * 8, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == 0) { + + std::cout << "Rank " << rank << " received element-node connectivity (" + << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; + for (int elem = 0; elem < num_elements_on_rank; elem++) { + std::cout << " Element " << elem << " nodes: "; + for (int node = 0; node < 8; node++) { + int idx = elem * 8 + node; + std::cout << nodes_in_elem_on_rank[idx] << " "; + } + std::cout << std::endl; + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == 1) { + + std::cout << "Rank " << rank << " received element-node connectivity (" + << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; + for (int elem = 0; elem < num_elements_on_rank; elem++) { + std::cout << " Element " << elem << " nodes: "; + for (int node = 0; node < 8; node++) { + int idx = elem * 8 + node; + 
std::cout << nodes_in_elem_on_rank[idx] << " "; + } + std::cout << std::endl; + } + } + + mesh.initialize_nodes(num_nodes_on_rank); + + std::vector required_node_state = { node_state::coords }; + + + mesh.initialize_elems(num_elements_on_rank, 3); + + + // WARNING WARNING WARNING: THIS IS WRONG< SHOULD BE LOCAL ID. Figure this out + for(int i = 0; i < num_elements_on_rank; i++) { + for(int j = 0; j < 8; j++) { + mesh.nodes_in_elem.host(i, j) = nodes_in_elem_on_rank[i * 8 + j]; + } + } + + mesh.nodes_in_elem.update_device(); + + + mesh.local_to_global_node_mapping = DCArrayKokkos(num_nodes_on_rank, "mesh.local_to_global_node_mapping"); + mesh.local_to_global_elem_mapping = DCArrayKokkos(num_elements_on_rank, "mesh.local_to_global_elem_mapping"); + + for(int i = 0; i < num_nodes_on_rank; i++) { + mesh.local_to_global_node_mapping.host(i) = nodes_on_rank[i]; + } + + for(int i = 0; i < num_elements_on_rank; i++) { + mesh.local_to_global_elem_mapping.host(i) = elements_on_rank[i]; + } + + mesh.local_to_global_node_mapping.update_device(); + mesh.local_to_global_elem_mapping.update_device(); + // in kernel, I will do the following + // On each rank, I need: + // 1. Numnber of nodes + // 2. node coordinates + // 3. number of elements + // 5. Local node to global node mapping + // 6. Local element to global element mapping + // 7. 
Element-node connectivity + // With the above, I can call build connectivity on the local mesh + + + + // elements_on_rank is now received via MPI_Scatterv above + + + + + // if (rank == 0) std::cout<<"Finished"< Date: Mon, 27 Oct 2025 11:58:17 -0500 Subject: [PATCH 04/52] ENH: Tidying up initial decomposition --- examples/mesh_decomp/mesh.h | 15 +- examples/mesh_decomp/mesh_decomp.cpp | 391 +++++++++++++++++++++------ examples/mesh_decomp/mesh_io.h | 331 ++++++++++++++++++++++- 3 files changed, 640 insertions(+), 97 deletions(-) diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h index 6d1e31d7..0011d2e8 100644 --- a/examples/mesh_decomp/mesh.h +++ b/examples/mesh_decomp/mesh.h @@ -221,6 +221,8 @@ struct Mesh_t // Patch: A discretization of a surface by subdividing the surface using the nodes // Corner: A element-node pair + bool verbose = false; + // ---- Global Mesh Definitions ---- // mesh_init::elem_name_tag elem_kind = mesh_init::linear_tensor_element; ///< The type of elements used in the mesh @@ -308,6 +310,7 @@ struct Mesh_t DCArrayKokkos local_to_global_node_mapping; ///< Local to global node mapping DCArrayKokkos local_to_global_elem_mapping; ///< Local to global element mapping + // initialization methods @@ -550,7 +553,7 @@ struct Mesh_t // DCArrayKokkos gauss_ordering_in_elem; // dimensions will be (num_patches_in_elem, num_gauss_in_patch); - printf("Number of dimensions = %zu \n", num_dims); + if (verbose) printf("Number of dimensions = %zu \n", num_dims); if (num_dims == 3) { // num_patches_in_surf = [1^2, 2^2, 3^2, 4^2, ... 
, Pn^2] @@ -973,7 +976,7 @@ struct Mesh_t node_ordering_in_elem.update_device(); Kokkos::fence(); - printf("Built node ordering \n"); + if (verbose) printf("Built node ordering \n"); // for saving the hash keys of the patches and then the neighboring elem_gid CArrayKokkos hash_keys_in_elem(num_elems, num_patches_in_elem, num_nodes_in_patch, "hash_keys_in_elem"); // always 4 ids in 3D @@ -1447,16 +1450,16 @@ struct Mesh_t void build_connectivity() { build_corner_connectivity(); - printf("Built corner connectivity \n"); + if (verbose) printf("Built corner connectivity \n"); build_elem_elem_connectivity(); - printf("Built element-element connectivity \n"); + if (verbose) printf("Built element-element connectivity \n"); build_patch_connectivity(); - printf("Built patch connectivity \n"); + if (verbose) printf("Built patch connectivity \n"); build_node_node_connectivity(); - printf("Built node-node connectivity \n"); + if (verbose) printf("Built node-node connectivity \n"); } ///////////////////////////////////////////////////////////////////////////// diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 1aa70f16..5b6635b7 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -15,6 +15,55 @@ #include "scotch.h" #include "ptscotch.h" +// Timer class for timing the execution of the matrix multiplication +class Timer { + private: + std::chrono::high_resolution_clock::time_point start_time; + std::chrono::high_resolution_clock::time_point end_time; + bool is_running; + + public: + Timer() : is_running(false) {} + + void start() { + start_time = std::chrono::high_resolution_clock::now(); + is_running = true; + } + + double stop() { + if (!is_running) { + std::cerr << "Timer was not running!" 
<< std::endl; + return 0.0; + } + end_time = std::chrono::high_resolution_clock::now(); + is_running = false; + + auto duration = std::chrono::duration_cast(end_time - start_time); + return duration.count() / 1000.0; // Convert to milliseconds + } +}; + +void print_rank_mesh_info(Mesh_t& mesh, int rank) { + + std::cout<& elems_per_rank, int num_elems, int world_size){ // Compute elements to send to each rank; handle remainders for non-even distribution @@ -39,13 +88,16 @@ void print_mesh_info(Mesh_t& mesh){ std::cout< elements_on_rank; std::vector nodes_on_rank; - std::vector elems_per_rank(world_size); - std::vector nodes_per_rank(world_size); + std::vector elems_per_rank(world_size); // number of elements to send to each rank size(world_size) + std::vector nodes_per_rank(world_size); // number of nodes to send to each rank size(world_size) // create a 2D vector of elements to send to each rank std::vector> elements_to_send(world_size); @@ -87,6 +141,16 @@ int main(int argc, char** argv) { // create a 2D vector of nodes to send to each rank std::vector> nodes_to_send(world_size); + // Create a 2D vector to hold the nodal positions on each rank + std::vector> node_pos_to_send(world_size); + + // create a 2D vector to hold the node positions on each rank + std::vector> node_pos_on_rank(world_size); + + +// ******************************************************** +// Build the initial mesh +// ******************************************************** if (rank == 0) { std::cout<<"World size: "< nodes_set; for (int j = 0; j < elems_per_rank[i]; j++) { - for (int k = 0; k < 8; k++) { + for (int k = 0; k < num_nodes_per_elem; k++) { nodes_set.insert(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); } } nodes_to_send[i] = std::vector(nodes_set.begin(), nodes_set.end()); - } + } - for (int i = 0; i < world_size; i++) { - nodes_per_rank[i] = nodes_to_send[i].size(); + if (print_info) { + + for (int i = 0; i < world_size; i++) { + nodes_per_rank[i] = 
nodes_to_send[i].size(); + } + std::cout< node_pos_on_rank_flat(num_nodes_on_rank * 3); + + if(rank == 0) + { + for (int i = 0; i < world_size; i++) { + for(int node_gid = 0; node_gid < nodes_to_send[i].size(); node_gid++) + { + node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 0)); + node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 1)); + node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 2)); + } + } + + // Prepare data for MPI_Scatterv (scatter with variable counts) + // Flatten the 2D node_pos_to_send into a 1D array + std::vector all_node_pos; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = nodes_to_send[i].size() * 3; + displs[i] = displacement; // displacement is the starting index of the nodes for the current rank in the flattened array + // Copy node positions for rank i to the flattened array + for(int j = 0; j < nodes_to_send[i].size(); j++) { + for(int k = 0; k < 3; k++) { + all_node_pos.push_back(node_pos_to_send[i][j * 3 + k]); + } + } + displacement += nodes_to_send[i].size() * 3; + } + + // Send the node positions to each rank + MPI_Scatterv(all_node_pos.data(), sendcounts.data(), displs.data(), MPI_DOUBLE, + node_pos_on_rank_flat.data(), num_nodes_on_rank * 3, MPI_DOUBLE, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_DOUBLE, + node_pos_on_rank_flat.data(), num_nodes_on_rank * 3, MPI_DOUBLE, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == 0 && print_info) { + // Print out the node positions on this rank + std::cout << "Rank " << rank << " received node positions: "; + for (int i = 0; i < num_nodes_on_rank; i++) { + std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " + << node_pos_on_rank_flat[i*3+1] << ", " + << node_pos_on_rank_flat[i*3+2] << ") "; + } + std::cout << std::endl; } 
- std::cout << std::endl; + MPI_Barrier(MPI_COMM_WORLD); - + if (rank == 1 && print_info) { + // Print out the node positions on this rank + std::cout << "Rank " << rank << " received node positions: "; + for (int i = 0; i < num_nodes_on_rank; i++) { + std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " + << node_pos_on_rank_flat[i*3+1] << ", " + << node_pos_on_rank_flat[i*3+2] << ") "; + } + std::cout << std::endl; + } +// ****************************************************************************************** +// Initialize the node state variables +// ****************************************************************************************** + + // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes_on_rank, 3, required_node_state); + + for(int i = 0; i < num_nodes_on_rank; i++) { + node.coords.host(i, 0) = node_pos_on_rank_flat[i*3]; + node.coords.host(i, 1) = node_pos_on_rank_flat[i*3+1]; + node.coords.host(i, 2) = node_pos_on_rank_flat[i*3+2]; + } + + node.coords.update_device(); + +// ****************************************************************************************** +// Send the element-node connectivity data from the initial mesh to each rank +// ****************************************************************************************** // Send the element-node connectivity data from the initial mesh to each rank - std::vector nodes_in_elem_on_rank; - - // All ranks need to resize their receive buffer - nodes_in_elem_on_rank.resize(num_elements_on_rank * 8); + std::vector nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); if (rank == 0) { // Prepare element-node connectivity data for each rank @@ -276,40 +463,39 @@ int main(int argc, char** argv) { int displacement = 0; for(int i = 0; i < world_size; i++) { - int num_connectivity_entries = elements_to_send[i].size() * 8; // 8 
nodes per element + int num_connectivity_entries = elements_to_send[i].size() * num_nodes_per_elem; // num_nodes_per_elem nodes per element sendcounts[i] = num_connectivity_entries; displs[i] = displacement; // Copy element-node connectivity for rank i for(int j = 0; j < elements_to_send[i].size(); j++) { - for(int k = 0; k < 8; k++) { + for(int k = 0; k < num_nodes_per_elem; k++) { all_nodes_in_elem.push_back(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); } } displacement += num_connectivity_entries; } - // Send the connectivity data to each rank MPI_Scatterv(all_nodes_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, - nodes_in_elem_on_rank.data(), num_elements_on_rank * 8, MPI_INT, + nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, 0, MPI_COMM_WORLD); } else { MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - nodes_in_elem_on_rank.data(), num_elements_on_rank * 8, MPI_INT, + nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, 0, MPI_COMM_WORLD); } MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0) { + if (rank == 0 && print_info) { std::cout << "Rank " << rank << " received element-node connectivity (" << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; for (int elem = 0; elem < num_elements_on_rank; elem++) { std::cout << " Element " << elem << " nodes: "; - for (int node = 0; node < 8; node++) { - int idx = elem * 8 + node; + for (int node = 0; node < num_nodes_per_elem; node++) { + int idx = elem * num_nodes_per_elem + node; std::cout << nodes_in_elem_on_rank[idx] << " "; } std::cout << std::endl; @@ -318,38 +504,27 @@ int main(int argc, char** argv) { MPI_Barrier(MPI_COMM_WORLD); - if (rank == 1) { + // if (rank == 1) { - std::cout << "Rank " << rank << " received element-node connectivity (" - << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; - for (int elem = 0; elem < 
num_elements_on_rank; elem++) { - std::cout << " Element " << elem << " nodes: "; - for (int node = 0; node < 8; node++) { - int idx = elem * 8 + node; - std::cout << nodes_in_elem_on_rank[idx] << " "; - } - std::cout << std::endl; - } - } - - mesh.initialize_nodes(num_nodes_on_rank); - - std::vector required_node_state = { node_state::coords }; + // std::cout << "Rank " << rank << " received element-node connectivity (" + // << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; + // for (int elem = 0; elem < num_elements_on_rank; elem++) { + // std::cout << " Element " << elem << " nodes: "; + // for (int node = 0; node < num_nodes_per_elem; node++) { + // int idx = elem * num_nodes_per_elem + node; + // std::cout << nodes_in_elem_on_rank[idx] << " "; + // } + // std::cout << std::endl; + // } + // } +// ****************************************************************************************** +// Initialize the mesh data structures for each rank +// ****************************************************************************************** + mesh.initialize_nodes(num_nodes_on_rank); mesh.initialize_elems(num_elements_on_rank, 3); - - // WARNING WARNING WARNING: THIS IS WRONG< SHOULD BE LOCAL ID. Figure this out - for(int i = 0; i < num_elements_on_rank; i++) { - for(int j = 0; j < 8; j++) { - mesh.nodes_in_elem.host(i, j) = nodes_in_elem_on_rank[i * 8 + j]; - } - } - - mesh.nodes_in_elem.update_device(); - - mesh.local_to_global_node_mapping = DCArrayKokkos(num_nodes_on_rank, "mesh.local_to_global_node_mapping"); mesh.local_to_global_elem_mapping = DCArrayKokkos(num_elements_on_rank, "mesh.local_to_global_elem_mapping"); @@ -363,27 +538,63 @@ int main(int argc, char** argv) { mesh.local_to_global_node_mapping.update_device(); mesh.local_to_global_elem_mapping.update_device(); - // in kernel, I will do the following - // On each rank, I need: - // 1. Numnber of nodes - // 2. node coordinates - // 3. 
number of elements - // 5. Local node to global node mapping - // 6. Local element to global element mapping - // 7. Element-node connectivity - // With the above, I can call build connectivity on the local mesh + // rebuild the local element-node connectivity using the local node ids + for(int i = 0; i < num_elements_on_rank; i++) { + for(int j = 0; j < num_nodes_per_elem; j++) { + int node_gid = nodes_in_elem_on_rank[i * num_nodes_per_elem + j]; - // elements_on_rank is now received via MPI_Scatterv above + int node_lid = -1; - + // Search through the local to global mapp to find the equivalent local index + for(int k = 0; k < num_nodes_on_rank; k++){ + + if(node_gid == mesh.local_to_global_node_mapping.host(k)) { + node_lid = k; + break; + } + } + + mesh.nodes_in_elem.host(i, j) = node_lid; + } + } + + + mesh.nodes_in_elem.update_device(); +// ****************************************************************************************** +// Build the connectivity for the local mesh +// ****************************************************************************************** - // if (rank == 0) std::cout<<"Finished"< +#include +#include +#include +#include +#include // for string pattern recoginition +#include +#include +#include +#include +#include @@ -35,6 +45,61 @@ inline int get_id(int i, int j, int k, int num_i, int num_j) return i + j * num_i + k * num_i * num_j; } +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn PointIndexFromIJK +/// +/// \brief Given (i,j,k) coordinates within the Lagrange hex, return an +/// offset into the local connectivity (PointIds) array. The order parameter +/// must point to an array of 3 integers specifying the order along each +/// axis of the hexahedron. 
+/// +///////////////////////////////////////////////////////////////////////////// +inline int PointIndexFromIJK(int i, int j, int k, const int* order) +{ + bool ibdy = (i == 0 || i == order[0]); + bool jbdy = (j == 0 || j == order[1]); + bool kbdy = (k == 0 || k == order[2]); + // How many boundaries do we lie on at once? + int nbdy = (ibdy ? 1 : 0) + (jbdy ? 1 : 0) + (kbdy ? 1 : 0); + + if (nbdy == 3) { // Vertex DOF + // ijk is a corner node. Return the proper index (somewhere in [0,7]): + return (i ? (j ? 2 : 1) : (j ? 3 : 0)) + (k ? 4 : 0); + } + + int offset = 8; + if (nbdy == 2) { // Edge DOF + if (!ibdy) { // On i axis + return (i - 1) + (j ? order[0] - 1 + order[1] - 1 : 0) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; + } + if (!jbdy) { // On j axis + return (j - 1) + (i ? order[0] - 1 : 2 * (order[0] - 1) + order[1] - 1) + (k ? 2 * (order[0] - 1 + order[1] - 1) : 0) + offset; + } + // !kbdy, On k axis + offset += 4 * (order[0] - 1) + 4 * (order[1] - 1); + return (k - 1) + (order[2] - 1) * (i ? (j ? 3 : 1) : (j ? 2 : 0)) + offset; + } + + offset += 4 * (order[0] - 1 + order[1] - 1 + order[2] - 1); + if (nbdy == 1) { // Face DOF + if (ibdy) { // On i-normal face + return (j - 1) + ((order[1] - 1) * (k - 1)) + (i ? (order[1] - 1) * (order[2] - 1) : 0) + offset; + } + offset += 2 * (order[1] - 1) * (order[2] - 1); + if (jbdy) { // On j-normal face + return (i - 1) + ((order[0] - 1) * (k - 1)) + (j ? (order[2] - 1) * (order[0] - 1) : 0) + offset; + } + offset += 2 * (order[2] - 1) * (order[0] - 1); + // kbdy, On k-normal face + return (i - 1) + ((order[0] - 1) * (j - 1)) + (k ? 
(order[0] - 1) * (order[1] - 1) : 0) + offset; + } + + // nbdy == 0: Body DOF + offset += 2 * ( (order[1] - 1) * (order[2] - 1) + (order[2] - 1) * (order[0] - 1) + (order[0] - 1) * (order[1] - 1)); + return offset + (i - 1) + (order[0] - 1) * ( (j - 1) + (order[1] - 1) * ( (k - 1))); +} + ///////////////////////////////////////////////////////////////////////////// /// /// \fn build_3d_box @@ -163,4 +228,268 @@ void build_3d_box( mesh.build_connectivity(); } // end build_3d_box + + +///////////////////////////////////////////////////////////////////////////// + /// + /// \fn write_vtk + /// + /// \brief Writes a vtk output file + /// + /// \param mesh mesh + /// \param node node data + /// \param rank rank + /// + ///////////////////////////////////////////////////////////////////////////// + void write_vtk(Mesh_t& mesh, + node_t& node, + int rank) + { + + CArray graphics_times(1); + int graphics_id = 0; + graphics_times(0) = 0.0; + + // ---- Update host data ---- + + // material point values + // State.MaterialPoints.den.update_host(); + // State.MaterialPoints.pres.update_host(); + // State.MaterialPoints.stress.update_host(); + // State.MaterialPoints.sspd.update_host(); + // State.MaterialPoints.sie.update_host(); + // State.MaterialPoints.mass.update_host(); + // State.MaterialPoints.conductivity.update_host(); + // State.MaterialPoints.temp_grad.update_host(); + // State.MaterialPoints.eroded.update_host(); + + + // gauss point values + // State.GaussPoints.vol.update_host(); + + // nodal values + node.coords.update_host(); + // State.node.vel.update_host(); + // State.node.mass.update_host(); + // State.node.temp.update_host(); + + Kokkos::fence(); + + + const int num_cell_scalar_vars = 1; + const int num_cell_vec_vars = 0; + const int num_cell_tensor_vars = 0; + + const int num_point_scalar_vars = 1; + const int num_point_vec_vars = 1; + + + // Scalar values associated with a cell + const char cell_scalar_var_names[num_cell_scalar_vars][15] = { + "rank_id" 
+ }; + + // const char cell_vec_var_names[num_cell_vec_vars][15] = { + + // }; + + const char point_scalar_var_names[num_point_scalar_vars][15] = { + "rank_id" + }; + + const char point_vec_var_names[num_point_vec_vars][15] = { + "pos" + }; + + // short hand + const size_t num_nodes = mesh.num_nodes; + const size_t num_elems = mesh.num_elems; + const size_t num_dims = mesh.num_dims; + + // save the cell state to an array for exporting to graphics files + auto elem_fields = CArray(num_elems, num_cell_scalar_vars); + int elem_switch = 1; + + + // save the output scalar fields to a single 2D array + + + // export material centric data to the elements + elem_fields(0, 0) = rank; + + + // save the vertex vector fields to an array for exporting to graphics files + CArray vec_fields(num_nodes, num_point_vec_vars, 3); + CArray point_scalar_fields(num_nodes, num_point_scalar_vars); + + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + // position, var 0 + vec_fields(node_gid, 0, 0) = node.coords.host(node_gid, 0); + vec_fields(node_gid, 0, 1) = node.coords.host(node_gid, 1); + vec_fields(node_gid, 0, 2) = node.coords.host(node_gid, 2); + + point_scalar_fields(node_gid, 0) = rank; + } // end for loop over vertices + + + FILE* out[20]; // the output files that are written to + char filename[100]; // char string + int max_len = sizeof filename; + int str_output_len; + + struct stat st; + + if (stat("vtk", &st) != 0) { + system("mkdir vtk"); + } + + // snprintf(filename, max_len, "ensight/data/%s.%05d.%s", name, graphics_id, vec_var_names[var]); + + //sprintf(filename, "vtk/Fierro.%05d.vtk", graphics_id); // mesh file + str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d_rank%d.vtk", graphics_id, rank); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "# vtk DataFile Version 2.0\n"); // part 2 + fprintf(out[0], "Mesh for Fierro\n"); // 
part 2 + fprintf(out[0], "ASCII \n"); // part 3 + fprintf(out[0], "DATASET UNSTRUCTURED_GRID\n\n"); // part 4 + + fprintf(out[0], "POINTS %zu float\n", mesh.num_nodes); + + // write all components of the point coordinates + for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + fprintf(out[0], + "%f %f %f\n", + node.coords.host(node_gid, 0), + node.coords.host(node_gid, 1), + node.coords.host(node_gid, 2)); + } // end for + + /* + --------------------------------------------------------------------------- + Write the elems + --------------------------------------------------------------------------- + */ + + fprintf(out[0], "\n"); + fprintf(out[0], "CELLS %lu %lu\n", mesh.num_elems, mesh.num_elems + mesh.num_elems * mesh.num_nodes_in_elem); // size=all printed values + + int Pn_order = mesh.Pn; + int order[3] = { Pn_order, Pn_order, Pn_order }; + + // const int num_1D_points = Pn_order+1; + + // write all global point numbers for this elem + for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + fprintf(out[0], "%lu ", mesh.num_nodes_in_elem); // num points in this elem + + for (int k = 0; k <= Pn_order; k++) { + for (int j = 0; j <= Pn_order; j++) { + for (int i = 0; i <= Pn_order; i++) { + size_t node_lid = PointIndexFromIJK(i, j, k, order); + fprintf(out[0], "%lu ", mesh.nodes_in_elem.host(elem_gid, node_lid)); + } + } + } + + fprintf(out[0], "\n"); + } // end for + + // Write the element types + fprintf(out[0], "\n"); + fprintf(out[0], "CELL_TYPES %zu \n", mesh.num_elems); + // VTK_LAGRANGE_HEXAHEDRON: 72, + // VTK_HIGHER_ORDER_HEXAHEDRON: 67 + // VTK_BIQUADRATIC_QUADRATIC_HEXAHEDRON = 33 + // element types: https://vtk.org/doc/nightly/html/vtkCellType_8h_source.html + // element types: https://kitware.github.io/vtk-js/api/Common_DataModel_CellTypes.html + // vtk format: https://www.kitware.com//modeling-arbitrary-order-lagrange-finite-elements-in-the-visualization-toolkit/ + for (size_t elem_gid = 0; elem_gid < mesh.num_elems; 
elem_gid++) { + fprintf(out[0], "%d \n", 72); + } + + /* + --------------------------------------------------------------------------- + Write the nodal vector variables to file + --------------------------------------------------------------------------- + */ + + fprintf(out[0], "\n"); + fprintf(out[0], "POINT_DATA %zu \n", mesh.num_nodes); + + // vtk vector vars = (position, velocity) + for (int var = 0; var < num_point_vec_vars; var++) { + fprintf(out[0], "VECTORS %s float \n", point_vec_var_names[var]); + for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + fprintf(out[0], "%f %f %f\n", + vec_fields(node_gid, var, 0), + vec_fields(node_gid, var, 1), + vec_fields(node_gid, var, 2)); + } // end for nodes + } // end for vec_vars + + + // vtk scalar vars = (temp) + for (int var = 0; var < num_point_scalar_vars; var++) { + fprintf(out[0], "SCALARS %s float 1\n", point_scalar_var_names[var]); + fprintf(out[0], "LOOKUP_TABLE default\n"); + for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + fprintf(out[0], "%f\n", + point_scalar_fields(node_gid, 0)); + } // end for nodes + } // end for vec_vars + + /* + --------------------------------------------------------------------------- + Write the scalar elem variable to file + --------------------------------------------------------------------------- + */ + fprintf(out[0], "\n"); + fprintf(out[0], "CELL_DATA %zu \n", mesh.num_elems); + + for (int var = 0; var < num_cell_scalar_vars; var++) { + fprintf(out[0], "SCALARS %s float 1\n", cell_scalar_var_names[var]); // the 1 is number of scalar components [1:4] + fprintf(out[0], "LOOKUP_TABLE default\n"); + for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + fprintf(out[0], "%f\n", rank); + } // end for elem + } // end for cell scalar_vars + + fclose(out[0]); + + // graphics_times(graphics_id) = time_value; + + // Write time series metadata + //sprintf(filename, "vtk/Fierro.vtk.series", graphics_id); // mesh file + str_output_len 
= snprintf(filename, max_len, "vtk/Fierro.vtk.series"); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + // mesh file + + out[0] = fopen(filename, "w"); + + fprintf(out[0], "{\n"); + fprintf(out[0], " \"file-series-version\" : \"1.0\",\n"); + fprintf(out[0], " \"files\" : [\n"); + + for (int i = 0; i <= graphics_id; i++) { + fprintf(out[0], " { \"name\" : \"Fierro.%05d.vtk\", \"time\" : %12.5e },\n", i, graphics_times(i) ); + } + + // fprintf(out[0], "%12.5e\n", graphics_times(i)); + fprintf(out[0], " ]\n"); // part 4 + fprintf(out[0], "}"); // part 4 + + fclose(out[0]); + + // increment graphics id counter + // graphics_id++; + + + } // end write vtk old + + + #endif \ No newline at end of file From 3f30bcda93284117e5c819ba203416bfadf095a2 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 27 Oct 2025 15:35:38 -0500 Subject: [PATCH 05/52] ENH: PTScotch now partitioning mesh, WIP --- examples/mesh_decomp/CMakeLists.txt | 18 +- examples/mesh_decomp/install_ptscotch.sh | 5 +- examples/mesh_decomp/mesh_decomp.cpp | 316 +++++++++++++++++++++-- 3 files changed, 318 insertions(+), 21 deletions(-) diff --git a/examples/mesh_decomp/CMakeLists.txt b/examples/mesh_decomp/CMakeLists.txt index b002a355..7b7306cd 100644 --- a/examples/mesh_decomp/CMakeLists.txt +++ b/examples/mesh_decomp/CMakeLists.txt @@ -26,7 +26,19 @@ if (KOKKOS) # Add include directories for MPI and Scotch/PT-Scotch target_include_directories(mesh_decomp PRIVATE ${MPI_CXX_INCLUDE_PATH} ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/src/include) - # Link libraries - target_link_libraries(mesh_decomp ${LINKING_LIBRARIES} MPI::MPI_CXX) - target_link_directories(mesh_decomp PRIVATE ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/src/lib) + # Link libraries - order matters! 
libptscotch depends on libscotch + # Use -Wl,--whole-archive to ensure all symbols are included from static libraries + # Note: Only link libptscotcherr.a (not libscotcherr.a) to avoid multiple definitions + target_link_libraries(mesh_decomp ${LINKING_LIBRARIES} MPI::MPI_CXX + -Wl,--whole-archive + ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/lib/libscotch.a + -Wl,--no-whole-archive + -Wl,--whole-archive + ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/lib/libptscotcherr.a + ${CMAKE_CURRENT_LIST_DIR}/lib/scotch/build/lib/libptscotch.a + -Wl,--no-whole-archive + -lz # zlib for gzip compression + -lbz2 # bzip2 library + -llzma # xz compression library + ) endif() diff --git a/examples/mesh_decomp/install_ptscotch.sh b/examples/mesh_decomp/install_ptscotch.sh index 00d29df9..29d3f853 100755 --- a/examples/mesh_decomp/install_ptscotch.sh +++ b/examples/mesh_decomp/install_ptscotch.sh @@ -27,7 +27,10 @@ cd scotch echo "Building Scotch..." mkdir build cd build -cmake .. +cmake .. -DCMAKE_BUILD_TYPE=Release \ + -DSCOTCH_MPI=ON \ + -DMPI_C_COMPILER=mpicc \ + -DMPI_Fortran_COMPILER=mpifort make echo "Installation complete! 
Libraries installed in: ${INSTALL_PREFIX}" \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 5b6635b7..34a1c683 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -96,7 +96,7 @@ int main(int argc, char** argv) { timer.start(); bool print_info = true; - bool print_vtk = true; + bool print_vtk = false; MPI_Init(&argc, &argv); @@ -154,7 +154,7 @@ int main(int argc, char** argv) { if (rank == 0) { std::cout<<"World size: "< elem_elem_counts(world_size); + int total_elem_elem_entries = 0; + + + if (rank == 0){ + // Calculate total number of connectivity entries for each rank + for(int i = 0; i < world_size; i++) { + elem_elem_counts[i] = 0; + for(int k = 0; k < elements_to_send[i].size(); k++) { + elem_elem_counts[i] += initial_mesh.num_elems_in_elem(elements_to_send[i][k]); + } + + std::cout << "Rank " << i << " will receive " << elem_elem_counts[i] << " element-element connectivity entries" << std::endl; + } + + // Print element-element connectivity entries for each rank in the initial mesh + for(int i = 0; i < world_size; i++) { + std::cout << std::endl; + std::cout << "Rank " << i << " will receive element-element connectivity entries for the following elements: "< elems_in_elem_on_rank(total_elem_elem_entries); + + // Now scatter the num_elems_in_elem for each element on each rank + std::vector num_elems_in_elem_per_rank(num_elements_on_rank); + + if (rank == 0) { + std::vector all_num_elems_in_elem; + std::vector displs_ee(world_size); + int displacement = 0; + + for(int i = 0; i < world_size; i++) { + displs_ee[i] = displacement; + for(int k = 0; k < elements_to_send[i].size(); k++) { + all_num_elems_in_elem.push_back(initial_mesh.num_elems_in_elem(elements_to_send[i][k])); + } + displacement += elements_to_send[i].size(); + } + + MPI_Scatterv(all_num_elems_in_elem.data(), elems_per_rank.data(), displs_ee.data(), MPI_INT, + 
num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + + if (rank == 0){ + // Prepare the element-element connectivity data for each rank + std::vector all_elems_in_elem; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + + for(int i = 0; i < world_size; i++) { + sendcounts[i] = elem_elem_counts[i]; + displs[i] = displacement; + + // Copy element-element connectivity for rank i + for(int k = 0; k < elements_to_send[i].size(); k++) { + for(int l = 0; l < initial_mesh.num_elems_in_elem(elements_to_send[i][k]); l++) { + all_elems_in_elem.push_back(initial_mesh.elems_in_elem(elements_to_send[i][k], l)); + } + } + displacement += elem_elem_counts[i]; + } + + // Send the element-element connectivity data to each rank using MPI_Scatterv + MPI_Scatterv(all_elems_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, + elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == 0 && print_info) { + std::cout << "Rank " << rank << " received element-element connectivity (" + << num_elements_on_rank << " elements, " << elems_in_elem_on_rank.size() << " entries):" << std::endl; + + int offset = 0; + for (int elem = 0; elem < num_elements_on_rank; elem++) { + std::cout << " Element " << elem << " has neighbors: "; + int num_neighbors = num_elems_in_elem_per_rank[elem]; + for (int j = 0; j < num_neighbors; j++) { + std::cout << elems_in_elem_on_rank[offset + j] << " "; + } + offset += num_neighbors; + std::cout << std::endl; + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == 1 && print_info) { + std::cout 
<< "Rank " << rank << " received element-element connectivity (" + << num_elements_on_rank << " elements, " << elems_in_elem_on_rank.size() << " entries):" << std::endl; + + int offset = 0; + for (int elem = 0; elem < num_elements_on_rank; elem++) { + std::cout << " Element " << elem << " has neighbors: "; + int num_neighbors = num_elems_in_elem_per_rank[elem]; + for (int j = 0; j < num_neighbors; j++) { + std::cout << elems_in_elem_on_rank[offset + j] << " "; + } + offset += num_neighbors; + std::cout << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); // ****************************************************************************************** // Initialize the mesh data structures for each rank @@ -585,7 +711,163 @@ int main(int argc, char** argv) { write_vtk(mesh, node, rank); } - + +// ****************************************************************************************** +// Repartition the mesh using pt-scotch +// ****************************************************************************************** + + + + // --- Simple compact CSR build using global neighbor GIDs (recommended) --- + SCOTCH_Dgraph dgraph; + if (SCOTCH_dgraphInit(&dgraph, MPI_COMM_WORLD) != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphInit failed\n"; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + const SCOTCH_Num baseval = 0; // 0-based + const SCOTCH_Num vertlocnbr = static_cast(mesh.num_elems); + const SCOTCH_Num vertlocmax = vertlocnbr; // no holes + + // Build compact CSR: vertloctab (size vertlocnbr+1) and edgeloctab (neighbors as GLOBAL elem GIDs) + std::vector vertloctab(vertlocnbr + 1); + std::vector edgeloctab; + edgeloctab.reserve(vertlocnbr * 6); // heuristic reserve + + // Build the graph from elems_in_elem_on_rank which contains global neighbor IDs + // First, create a map from element GID to its position in elems_in_elem_on_rank + std::map elem_gid_to_offset; + size_t current_offset = 0; + for (size_t k = 0; k < num_elements_on_rank; k++) { + 
elem_gid_to_offset[elements_on_rank[k]] = current_offset; + current_offset += num_elems_in_elem_per_rank[k]; + } + + SCOTCH_Num offset = 0; + for (size_t lid = 0; lid < mesh.num_elems; ++lid) { + vertloctab[lid] = offset; + + // Get local element's global ID + int elem_gid = mesh.local_to_global_elem_mapping.host(lid); + + // Get the offset in elems_in_elem_on_rank for this element + size_t elems_in_elem_offset = elem_gid_to_offset[elem_gid]; + + // Get neighbor count - need to find the right index in elements_on_rank + size_t idx = 0; + for (size_t k = 0; k < num_elements_on_rank; k++) { + if (elements_on_rank[k] == elem_gid) { + idx = k; + break; + } + } + size_t num_nbrs = num_elems_in_elem_per_rank[idx]; + + for (size_t j = 0; j < num_nbrs; ++j) { + // Get global neighbor ID from elems_in_elem_on_rank + size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; + edgeloctab.push_back(static_cast(neighbor_gid)); + ++offset; + } + } + vertloctab[vertlocnbr] = offset; + const SCOTCH_Num edgelocnbr = offset; + const SCOTCH_Num edgelocsiz = edgelocnbr; + + // Debug: print graph structure + if (print_info) { + std::cout << "Rank " << rank << ": vertlocnbr=" << vertlocnbr << ", edgelocnbr=" << edgelocnbr << std::endl; + std::cout << "vertloctab: "; + for (size_t i = 0; i <= vertlocnbr; i++) { + std::cout << vertloctab[i] << " "; + } + std::cout << std::endl; + std::cout << "edgeloctab (first 20): "; + for (size_t i = 0; i < std::min((size_t)20, edgeloctab.size()); i++) { + std::cout << edgeloctab[i] << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + // NOTE: Using compact CSR => pass vendloctab = nullptr, vlblloctab = nullptr. + // edgeloctab contains GLOBAL neighbor IDs; SCOTCH will discover remote vertices itself. 
+ int rc = SCOTCH_dgraphBuild(&dgraph, + baseval, + vertlocnbr, + vertlocmax, + vertloctab.data(), // compact offsets + /*vendloctab*/ nullptr, + /*veloloctab*/ nullptr, + /*vlblloctab*/ nullptr, + edgelocnbr, + edgelocsiz, + edgeloctab.data(), + /*edgegsttab*/ nullptr, + /*edloloctab*/ nullptr); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphBuild failed rc=" << rc << "\n"; + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); + } + + // Print graph info after build but before check + if (print_info) { + SCOTCH_Num vertlocnbr_out, vertloctab_size; + SCOTCH_dgraphSize(&dgraph, &vertlocnbr_out, nullptr, nullptr, nullptr); + std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr=" << vertlocnbr_out << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + // Sanity check + rc = SCOTCH_dgraphCheck(&dgraph); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphCheck failed rc=" << rc << "\n"; + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); + } + + // Partition the mesh using pt-scotch + // Partition into world_size parts + // Note: Since we already have a distributed mesh, we're asking for a repartition + SCOTCH_Arch archdat; + SCOTCH_archInit(&archdat); + SCOTCH_archCmplt(&archdat, static_cast(world_size)); + + SCOTCH_Strat stratdat; + SCOTCH_stratInit(&stratdat); + + std::vector partloctab(vertlocnbr); + rc = SCOTCH_dgraphMap(&dgraph, &archdat, &stratdat, partloctab.data()); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphMap failed rc=" << rc << "\n"; + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); + } + + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + + // Print partition assignment (optional) + for (size_t lid = 0; lid < mesh.num_elems; ++lid) { + size_t gid = mesh.local_to_global_elem_mapping.host(lid); + std::cout << "[rank " << rank << "] elem_local=" << lid << " gid=" << gid + << " -> 
part=" << partloctab[lid] << "\n"; + } + + + + MPI_Barrier(MPI_COMM_WORLD); + + + + + + + + } // end MATAR scope MATAR_FINALIZE(); From 6145513dd7ccd49772a50b02f6f5c10c8fc2c755 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 27 Oct 2025 15:58:17 -0500 Subject: [PATCH 06/52] DOC: Adding documentation and comments for future Jacob, WIP --- examples/mesh_decomp/mesh_decomp.cpp | 228 +++++++++++++++++++-------- 1 file changed, 166 insertions(+), 62 deletions(-) diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 34a1c683..1539fbb6 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -112,17 +112,21 @@ int main(int argc, char** argv) { // Initial mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {2, 2, 2}; + int num_elems_dim[3] = {4, 4, 4}; Mesh_t initial_mesh; GaussPoint_t initial_GaussPoints; node_t initial_node; // Create mesh, gauss points, and node data structures on each rank + // This is the initial partitioned mesh Mesh_t mesh; GaussPoint_t GaussPoints; node_t node; + // Mesh partitioned by pt-scotch + Mesh_t final_mesh; + int num_elements_on_rank = 0; int num_nodes_on_rank = 0; @@ -718,42 +722,112 @@ int main(int argc, char** argv) { - // --- Simple compact CSR build using global neighbor GIDs (recommended) --- + /********************************************************************************** + * Build PT-Scotch distributed graph representation of the mesh for repartitioning * + ********************************************************************************** + * + * This section constructs the distributed graph (SCOTCH_Dgraph) needed by PT-Scotch + * for mesh repartitioning. In this graph, each mesh element is a vertex, and edges + * correspond to mesh-neighbor relationships (i.e., elements that share a face or are + * otherwise neighbors per your mesh definition). 
+ * + * We use the compact CSR (Compressed Sparse Row) representation, passing only the + * essential information required by PT-Scotch. + * + * Variables and structures used: + * - SCOTCH_Dgraph dgraph: + * The distributed graph instance managed by PT-Scotch. Each MPI rank creates + * and fills in its portion of the global graph. + * + * - const SCOTCH_Num baseval: + * The base value for vertex and edge numbering. Set to 0 for C-style zero-based + * arrays. Always use 0 unless you are using Fortran style 1-based arrays. + * + * - const SCOTCH_Num vertlocnbr: + * The *number of local vertices* (mesh elements) defined on this MPI rank. + * In our mesh, this is mesh.num_elems. PT-Scotch expects each rank to specify + * its own local vertex count. + * + * - const SCOTCH_Num vertlocmax: + * The *maximum number of local vertices* that could be stored (capacity). We + * allocate with no unused holes, so vertlocmax = vertlocnbr. + * + * - std::vector vertloctab: + * CSR array [size vertlocnbr+1]: for each local vertex i, vertloctab[i] + * gives the index in edgeloctab where the neighbor list of vertex i begins. + * PT-Scotch expects this array to be of size vertlocnbr+1, where the difference + * vertloctab[i+1] - vertloctab[i] gives the number of edges for vertex i. + * + * - std::vector edgeloctab: + * CSR array [variable size]: a flattened list of *neighboring element global IDs*, + * in no particular order. For vertex i, its neighbors are located at + * edgeloctab[vertloctab[i]...vertloctab[i+1]-1]. + * In this compact CSR, these are global IDs (GIDs), enabling PT-Scotch to + * recognize edges both within and across ranks. + * + * - std::map elem_gid_to_offset: + * Helper map: For a given element global ID, gives the starting offset in + * the flattened neighbor array (elems_in_elem_on_rank) where this element's + * list of neighbor GIDs begins. This allows efficient neighbor list lookup. 
+ * + * - (other arrays used, from mesh setup and communication phase) + * - elements_on_rank: vector of global element IDs owned by this rank. + * - num_elements_on_rank: number of owned elements. + * - num_elems_in_elem_per_rank: array, for each owned element, how many + * neighbors it has. + * - elems_in_elem_on_rank: flattened array of global neighbor IDs for all local elements. + * + **********************************************************************************/ + + // --- Step 1: Initialize the PT-Scotch distributed graph object on this MPI rank --- SCOTCH_Dgraph dgraph; if (SCOTCH_dgraphInit(&dgraph, MPI_COMM_WORLD) != 0) { std::cerr << "[rank " << rank << "] SCOTCH_dgraphInit failed\n"; MPI_Abort(MPI_COMM_WORLD, 1); } - const SCOTCH_Num baseval = 0; // 0-based + // Set base value for numbering (0 for C-style arrays) + const SCOTCH_Num baseval = 0; + + // vertlocnbr: Number of elements (vertices) that are local to this MPI rank const SCOTCH_Num vertlocnbr = static_cast(mesh.num_elems); - const SCOTCH_Num vertlocmax = vertlocnbr; // no holes - // Build compact CSR: vertloctab (size vertlocnbr+1) and edgeloctab (neighbors as GLOBAL elem GIDs) + // vertlocmax: Maximum possible local vertices (no holes, so identical to vertlocnbr) + const SCOTCH_Num vertlocmax = vertlocnbr; + + // --- Step 2: Build compact CSR arrays for PT-Scotch (vertloctab, edgeloctab) --- + // vertloctab: for each local mesh element [vertex], gives index in edgeloctab where its neighbor list begins std::vector vertloctab(vertlocnbr + 1); + + // edgeloctab: flat array of neighbor global IDs for all local elements, built in order std::vector edgeloctab; - edgeloctab.reserve(vertlocnbr * 6); // heuristic reserve + edgeloctab.reserve(vertlocnbr * 6); // heuristic: assume typical mesh degree is ~6, for performance - // Build the graph from elems_in_elem_on_rank which contains global neighbor IDs - // First, create a map from element GID to its position in elems_in_elem_on_rank + // Construct 
a map from element GID to its offset into elems_in_elem_on_rank (the array of neighbor GIDs) + // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. std::map elem_gid_to_offset; size_t current_offset = 0; for (size_t k = 0; k < num_elements_on_rank; k++) { elem_gid_to_offset[elements_on_rank[k]] = current_offset; current_offset += num_elems_in_elem_per_rank[k]; } - - SCOTCH_Num offset = 0; + + // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- + SCOTCH_Num offset = 0; // running count of edges encountered + for (size_t lid = 0; lid < mesh.num_elems; ++lid) { + + // Record current edge offset for vertex lid in vertloctab vertloctab[lid] = offset; - // Get local element's global ID + // Obtain this local element's global ID (from mapping) int elem_gid = mesh.local_to_global_elem_mapping.host(lid); - - // Get the offset in elems_in_elem_on_rank for this element + + // Find offset in the flattened neighbor array for this element's neighbor list size_t elems_in_elem_offset = elem_gid_to_offset[elem_gid]; - - // Get neighbor count - need to find the right index in elements_on_rank + + // For this element, find the count of its neighbors + // This requires finding its index in the elements_on_rank array size_t idx = 0; for (size_t k = 0; k < num_elements_on_rank; k++) { if (elements_on_rank[k] == elem_gid) { @@ -762,27 +836,33 @@ int main(int argc, char** argv) { } } size_t num_nbrs = num_elems_in_elem_per_rank[idx]; - + + // Append each neighbor (by its GLOBAL elem GID) to edgeloctab for (size_t j = 0; j < num_nbrs; ++j) { - // Get global neighbor ID from elems_in_elem_on_rank - size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; + size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; // This is a global element ID! 
edgeloctab.push_back(static_cast(neighbor_gid)); - ++offset; + ++offset; // Increment running edge count } } + + // vertloctab[vertlocnbr] stores total number of edges written, finalizes the CSR structure vertloctab[vertlocnbr] = offset; - const SCOTCH_Num edgelocnbr = offset; - const SCOTCH_Num edgelocsiz = edgelocnbr; - // Debug: print graph structure + // edgelocnbr/edgelocsiz: Number of edge endpoints defined locally + // (PT-Scotch's distributed graphs allow edges to be replicated or owned by either endpoint) + const SCOTCH_Num edgelocnbr = offset; // total number of edge endpoints (sum of all local neighbor degrees) + const SCOTCH_Num edgelocsiz = edgelocnbr; // allocated size matches number of endpoints + + // Optionally print graph structure for debugging/validation if (print_info) { - std::cout << "Rank " << rank << ": vertlocnbr=" << vertlocnbr << ", edgelocnbr=" << edgelocnbr << std::endl; - std::cout << "vertloctab: "; + std::cout << "Rank " << rank << ": vertlocnbr = # of local elements(vertices) = " << vertlocnbr + << ", edgelocnbr = # of local edge endpoints = " << edgelocnbr << std::endl; + std::cout << "vertloctab (CSR row offsets): "; for (size_t i = 0; i <= vertlocnbr; i++) { std::cout << vertloctab[i] << " "; } std::cout << std::endl; - std::cout << "edgeloctab (first 20): "; + std::cout << "edgeloctab (first 20 neighbor GIDs): "; for (size_t i = 0; i < std::min((size_t)20, edgeloctab.size()); i++) { std::cout << edgeloctab[i] << " "; } @@ -790,36 +870,48 @@ int main(int argc, char** argv) { } MPI_Barrier(MPI_COMM_WORLD); - // NOTE: Using compact CSR => pass vendloctab = nullptr, vlblloctab = nullptr. - // edgeloctab contains GLOBAL neighbor IDs; SCOTCH will discover remote vertices itself. 
- int rc = SCOTCH_dgraphBuild(&dgraph, - baseval, - vertlocnbr, - vertlocmax, - vertloctab.data(), // compact offsets - /*vendloctab*/ nullptr, - /*veloloctab*/ nullptr, - /*vlblloctab*/ nullptr, - edgelocnbr, - edgelocsiz, - edgeloctab.data(), - /*edgegsttab*/ nullptr, - /*edloloctab*/ nullptr); + /************************************************************************** + * Step 4: Build the distributed graph using PT-Scotch's SCOTCH_dgraphBuild + * + * - PT-Scotch will use our CSR arrays. Since we use compact representation, + * most optional arrays ("veloloctab", "vlblloctab", "edgegsttab", "edloloctab") + * can be passed as nullptr. + * - edgeloctab contains *GLOBAL element GIDs* of neighbors. PT-Scotch uses this + * to discover connections across processor boundaries, so you do not have to + * encode ownership or partition information yourself. + **************************************************************************/ + int rc = SCOTCH_dgraphBuild( + &dgraph, + baseval, // start index (0) + vertlocnbr, // local vertex count (local elements) + vertlocmax, // local vertex max (no holes) + vertloctab.data(), // row offsets in edgeloctab + /*vendloctab*/ nullptr, // end of row offsets (compact CSR => nullptr) + /*veloloctab*/ nullptr, // vertex weights, not used + /*vlblloctab*/ nullptr, // vertex global labels (we use GIDs in edgeloctab) + edgelocnbr, // local edge endpoints count + edgelocsiz, // size of edge array + edgeloctab.data(), // global neighbor IDs for each local node + /*edgegsttab*/ nullptr, // ghost edge array, not used + /*edloloctab*/ nullptr // edge weights, not used + ); if (rc != 0) { std::cerr << "[rank " << rank << "] SCOTCH_dgraphBuild failed rc=" << rc << "\n"; SCOTCH_dgraphFree(&dgraph); MPI_Abort(MPI_COMM_WORLD, rc); } - // Print graph info after build but before check + // Optionally, print rank summary after graph build for further validation if (print_info) { - SCOTCH_Num vertlocnbr_out, vertloctab_size; + SCOTCH_Num 
vertlocnbr_out; SCOTCH_dgraphSize(&dgraph, &vertlocnbr_out, nullptr, nullptr, nullptr); - std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr=" << vertlocnbr_out << std::endl; + std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr = " << vertlocnbr_out << std::endl; } MPI_Barrier(MPI_COMM_WORLD); - // Sanity check + /******************************************************** + * Step 5: Validate the graph using SCOTCH_dgraphCheck + ********************************************************/ rc = SCOTCH_dgraphCheck(&dgraph); if (rc != 0) { std::cerr << "[rank " << rank << "] SCOTCH_dgraphCheck failed rc=" << rc << "\n"; @@ -827,16 +919,21 @@ int main(int argc, char** argv) { MPI_Abort(MPI_COMM_WORLD, rc); } - // Partition the mesh using pt-scotch - // Partition into world_size parts - // Note: Since we already have a distributed mesh, we're asking for a repartition - SCOTCH_Arch archdat; + /************************************************************** + * Step 6: Partition (repartition) the mesh using PT-Scotch + * - Each vertex (mesh element) will be assigned a part (mesh chunk). + * - Arch is initialized for a complete graph of world_size parts (one per rank). + * - Loki + **************************************************************/ + SCOTCH_Arch archdat; // PT-Scotch architecture structure: describes desired partition topology SCOTCH_archInit(&archdat); - SCOTCH_archCmplt(&archdat, static_cast(world_size)); - - SCOTCH_Strat stratdat; + SCOTCH_archCmplt(&archdat, static_cast(world_size)); // Partition into world_size complete nodes + + SCOTCH_Strat stratdat; // PT-Scotch strategy object: holds partitioning options/settings SCOTCH_stratInit(&stratdat); - + + // partloctab: output array mapping each local element (vertex) to a *target partition number* + // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. 
std::vector partloctab(vertlocnbr); rc = SCOTCH_dgraphMap(&dgraph, &archdat, &stratdat, partloctab.data()); if (rc != 0) { @@ -846,20 +943,27 @@ int main(int argc, char** argv) { SCOTCH_dgraphFree(&dgraph); MPI_Abort(MPI_COMM_WORLD, rc); } - + + // Clean up PT-Scotch strategy and architecture objects SCOTCH_stratExit(&stratdat); SCOTCH_archExit(&archdat); - // Print partition assignment (optional) - for (size_t lid = 0; lid < mesh.num_elems; ++lid) { - size_t gid = mesh.local_to_global_elem_mapping.host(lid); - std::cout << "[rank " << rank << "] elem_local=" << lid << " gid=" << gid - << " -> part=" << partloctab[lid] << "\n"; + /*************************************************************************** + * Step 7 (Optional): Print out the partitioning assignment per element + * - Each local element's local index lid and global ID (gid) are listed with the + * part to which PT-Scotch has assigned them. + ***************************************************************************/ + for(int rank_id = 0; rank_id < world_size; rank_id++) { + if(rank_id == rank) { + for (size_t lid = 0; lid < mesh.num_elems; ++lid) { + size_t gid = mesh.local_to_global_elem_mapping.host(lid); + std::cout << "[rank " << rank_id << "] elem_local=" << lid << " gid=" << gid + << " -> part=" << partloctab[lid] << "\n"; + } + MPI_Barrier(MPI_COMM_WORLD); + } + MPI_Barrier(MPI_COMM_WORLD); } - - - - MPI_Barrier(MPI_COMM_WORLD); From 91b3b8dace7e8e5c7cdbf80e053784d10c6c2acb Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 27 Oct 2025 16:51:43 -0500 Subject: [PATCH 07/52] ENH: Debugging repartition, nodal coordinates seem off --- examples/mesh_decomp/mesh_decomp.cpp | 237 ++++++++++++++++++++++++++- 1 file changed, 234 insertions(+), 3 deletions(-) diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 1539fbb6..3cd4c709 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -51,7 +51,7 @@ void 
print_rank_mesh_info(Mesh_t& mesh, int rank) { std::cout<<"Mesh has "<> elems_to_send(world_size); + for (int lid = 0; lid < mesh.num_elems; ++lid) { + int dest = static_cast(partloctab[lid]); + int elem_gid = static_cast(mesh.local_to_global_elem_mapping.host(lid)); + elems_to_send[dest].push_back(elem_gid); + } + + // -------------- Phase 2: Exchange element GIDs -------------- + std::vector sendcounts(world_size), recvcounts(world_size); + for (int r = 0; r < world_size; ++r) + sendcounts[r] = static_cast(elems_to_send[r].size()); + + MPI_Alltoall(sendcounts.data(), 1, MPI_INT, recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // Compute displacements + std::vector sdispls(world_size), rdispls(world_size); + int send_total = 0, recv_total = 0; + for (int r = 0; r < world_size; ++r) { + sdispls[r] = send_total; + rdispls[r] = recv_total; + send_total += sendcounts[r]; + recv_total += recvcounts[r]; + } + + + // Flatten send buffer + std::vector sendbuf; + sendbuf.reserve(send_total); + for (int r = 0; r < world_size; ++r) + sendbuf.insert(sendbuf.end(), elems_to_send[r].begin(), elems_to_send[r].end()); + + // Receive new local element GIDs + std::vector recvbuf(recv_total); + MPI_Alltoallv(sendbuf.data(), sendcounts.data(), sdispls.data(), MPI_INT, + recvbuf.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // New elements owned by this rank + std::vector new_elem_gids = recvbuf; + int num_new_elems = static_cast(new_elem_gids.size()); + + + if (print_info) { + std::cout << "[rank " << rank << "] new elems: " << num_new_elems << std::endl; + } + + // -------------- Phase 3: Send element–node connectivity -------------- + int nodes_per_elem = mesh.num_nodes_in_elem; + + // Flatten element-node connectivity by global node IDs + std::vector conn_sendbuf; + for (int r = 0; r < world_size; ++r) { + for (int gid : elems_to_send[r]) { + // find local element lid from gid + int 
lid = -1; + for (int i = 0; i < mesh.num_elems; ++i) + if (mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + + for (int j = 0; j < nodes_per_elem; ++j) { + int node_lid = mesh.nodes_in_elem.host(lid, j); + int node_gid = mesh.local_to_global_node_mapping.host(node_lid); + conn_sendbuf.push_back(node_gid); + } + } + } + + // element-node connectivity counts (ints per dest rank) + std::vector conn_sendcounts(world_size), conn_recvcounts(world_size); + for (int r = 0; r < world_size; ++r) + conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; + + MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + std::vector conn_sdispls(world_size), conn_rdispls(world_size); + int conn_send_total = 0, conn_recv_total = 0; + for (int r = 0; r < world_size; ++r) { + conn_sdispls[r] = conn_send_total; + conn_rdispls[r] = conn_recv_total; + conn_send_total += conn_sendcounts[r]; + conn_recv_total += conn_recvcounts[r]; + } + + std::vector conn_recvbuf(conn_recv_total); + MPI_Alltoallv(conn_sendbuf.data(), conn_sendcounts.data(), conn_sdispls.data(), MPI_INT, + conn_recvbuf.data(), conn_recvcounts.data(), conn_rdispls.data(), MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + + // -------------- Phase 4: Build new node list (unique GIDs) -------------- + std::set node_gid_set(conn_recvbuf.begin(), conn_recvbuf.end()); + std::vector new_node_gids(node_gid_set.begin(), node_gid_set.end()); + int num_new_nodes = static_cast(new_node_gids.size()); + + // Build map gid→lid + std::unordered_map node_gid_to_lid; + for (int i = 0; i < num_new_nodes; ++i) + node_gid_to_lid[new_node_gids[i]] = i; + + if (print_info) + std::cout << "[rank " << rank << "] owns " << num_new_nodes << " unique nodes\n"; + // -------------- Phase 5: Request node coordinates -------------- + std::vector node_coords_sendbuf; + for (int r = 0; r < world_size; ++r) { + for (int gid : elems_to_send[r]) { + 
int lid = -1; + for (int i = 0; i < mesh.num_elems; ++i) + if (mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + for (int j = 0; j < nodes_per_elem; ++j) { + int node_lid = mesh.nodes_in_elem.host(lid, j); + int node_gid = mesh.local_to_global_node_mapping.host(node_lid); + + node_coords_sendbuf.push_back(node.coords.host(node_lid, 0)); + node_coords_sendbuf.push_back(node.coords.host(node_lid, 1)); + node_coords_sendbuf.push_back(node.coords.host(node_lid, 2)); + } + } + } + + // Each node is 3 doubles; same sendcounts scaling applies + std::vector coord_sendcounts(world_size), coord_recvcounts(world_size); + for (int r = 0; r < world_size; ++r) + coord_sendcounts[r] = sendcounts[r] * nodes_per_elem * 3; + + MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + std::vector coord_sdispls(world_size), coord_rdispls(world_size); + int coord_send_total = 0, coord_recv_total = 0; + for (int r = 0; r < world_size; ++r) { + coord_sdispls[r] = coord_send_total; + coord_rdispls[r] = coord_recv_total; + coord_send_total += coord_sendcounts[r]; + coord_recv_total += coord_recvcounts[r]; + } + + std::vector coord_recvbuf(coord_recv_total); + MPI_Alltoallv(node_coords_sendbuf.data(), coord_sendcounts.data(), coord_sdispls.data(), MPI_DOUBLE, + coord_recvbuf.data(), coord_recvcounts.data(), coord_rdispls.data(), MPI_DOUBLE, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // -------------- Phase 6: Build the final_mesh -------------- + final_mesh.initialize_nodes(num_new_nodes); + final_mesh.initialize_elems(num_new_elems, mesh.num_dims); + final_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes); + final_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); + + // Fill global mappings + for (int i = 0; i < num_new_nodes; ++i) + final_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; + for (int i = 0; i < num_new_elems; ++i) + 
final_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; + + final_mesh.local_to_global_node_mapping.update_device(); + final_mesh.local_to_global_elem_mapping.update_device(); + + // // Rebuild nodes_in_elem + // for (int e = 0; e < num_new_elems; ++e) { + // for (int j = 0; j < nodes_per_elem; ++j) { + // int node_gid = conn_recvbuf[e * nodes_per_elem + j]; + // int node_lid = node_gid_to_lid[node_gid]; + // final_mesh.nodes_in_elem.host(e, j) = node_lid; + // } + // } + // final_mesh.nodes_in_elem.update_device(); + + + // rebuild the local element-node connectivity using the local node ids + for(int i = 0; i < num_new_elems; i++) { + for(int j = 0; j < nodes_per_elem; j++) { + + int node_gid = conn_recvbuf[i * nodes_per_elem + j]; + + int node_lid = -1; + + // Search through the local to global mapp to find the equivalent local index + for(int k = 0; k < num_new_nodes; k++){ + + if(node_gid == final_mesh.local_to_global_node_mapping.host(k)) { + node_lid = k; + break; + } + } + + final_mesh.nodes_in_elem.host(i, j) = node_lid; + } + } + + final_mesh.nodes_in_elem.update_device(); + + // Fill node coordinates + final_node.initialize(num_new_nodes, 3, {node_state::coords}); + for (int i = 0; i < num_new_nodes; ++i) { + final_node.coords.host(i, 0) = coord_recvbuf[i*3 + 0]; + final_node.coords.host(i, 1) = coord_recvbuf[i*3 + 1]; + final_node.coords.host(i, 2) = coord_recvbuf[i*3 + 2]; + } + final_node.coords.update_device(); + + // Connectivity rebuild + final_mesh.build_connectivity(); + MPI_Barrier(MPI_COMM_WORLD); + + for(int i = 0; i < world_size; i++) { + if(rank == i) { + print_rank_mesh_info(final_mesh, i); + } + MPI_Barrier(MPI_COMM_WORLD); + } + MPI_Barrier(MPI_COMM_WORLD); + write_vtk(final_mesh, final_node, rank); } // end MATAR scope MATAR_FINALIZE(); From f40c187695e166ab31b2fcea84a6c4c0d4683126 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 27 Oct 2025 17:03:33 -0500 Subject: [PATCH 08/52] ENH: It works --- 
examples/mesh_decomp/mesh_decomp.cpp | 31 ++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 3cd4c709..feccca61 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -112,7 +112,7 @@ int main(int argc, char** argv) { // Initial mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {2, 2, 2}; + int num_elems_dim[3] = {20, 20, 20}; Mesh_t initial_mesh; GaussPoint_t initial_GaussPoints; @@ -1181,11 +1181,34 @@ int main(int argc, char** argv) { final_mesh.nodes_in_elem.update_device(); // Fill node coordinates + // coord_recvbuf contains coords in element-node order, but we need them in node order + // Build a map from node GID to coordinates + std::map> node_gid_to_coords; + int coord_idx = 0; + for (int e = 0; e < num_new_elems; ++e) { + for (int j = 0; j < nodes_per_elem; ++j) { + int node_gid = conn_recvbuf[e * nodes_per_elem + j]; + if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { + node_gid_to_coords[node_gid] = { + coord_recvbuf[coord_idx*3 + 0], + coord_recvbuf[coord_idx*3 + 1], + coord_recvbuf[coord_idx*3 + 2] + }; + } + coord_idx++; + } + } + + // Now fill coordinates in node order final_node.initialize(num_new_nodes, 3, {node_state::coords}); for (int i = 0; i < num_new_nodes; ++i) { - final_node.coords.host(i, 0) = coord_recvbuf[i*3 + 0]; - final_node.coords.host(i, 1) = coord_recvbuf[i*3 + 1]; - final_node.coords.host(i, 2) = coord_recvbuf[i*3 + 2]; + int node_gid = new_node_gids[i]; + auto it = node_gid_to_coords.find(node_gid); + if (it != node_gid_to_coords.end()) { + final_node.coords.host(i, 0) = it->second[0]; + final_node.coords.host(i, 1) = it->second[1]; + final_node.coords.host(i, 2) = it->second[2]; + } } final_node.coords.update_device(); From 35d0348d9619c0b83a18e1b762c327e6beea65e6 Mon Sep 17 00:00:00 2001 
From: Jacob Moore Date: Wed, 29 Oct 2025 09:54:47 -0500 Subject: [PATCH 09/52] ENH: Swapping to binary search and adding timers --- examples/mesh_decomp/mesh_decomp.cpp | 213 +++++++++++++++++++++------ 1 file changed, 166 insertions(+), 47 deletions(-) diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index feccca61..d6144eaf 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -95,7 +95,7 @@ int main(int argc, char** argv) { Timer timer; timer.start(); - bool print_info = true; + bool print_info = false; bool print_vtk = false; @@ -112,7 +112,7 @@ int main(int argc, char** argv) { // Initial mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {20, 20, 20}; + int num_elems_dim[3] = {100, 100, 100}; Mesh_t initial_mesh; GaussPoint_t initial_GaussPoints; @@ -156,6 +156,8 @@ int main(int argc, char** argv) { // ******************************************************** // Build the initial mesh // ******************************************************** + double t_init_mesh_start = MPI_Wtime(); + if (rank == 0) { std::cout<<"World size: "<(nodes_set.begin(), nodes_set.end()); } + for (int i = 0; i < world_size; i++) { + nodes_per_rank[i] = nodes_to_send[i].size(); + } + if (print_info) { - for (int i = 0; i < world_size; i++) { - nodes_per_rank[i] = nodes_to_send[i].size(); - } + std::cout< node_pos_on_rank_flat(num_nodes_on_rank * 3); + // Timer for scattering node positions + double t_scatter_nodepos_start = MPI_Wtime(); + if(rank == 0) { for (int i = 0; i < world_size; i++) { @@ -437,6 +489,15 @@ int main(int argc, char** argv) { std::cout << std::endl; } + MPI_Barrier(MPI_COMM_WORLD); + + double t_scatter_nodepos_end = MPI_Wtime(); + if(rank == 0) { + std::cout<<" Finished scattering the node positions to each rank"< nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); + double t_scatter_elemnode_start = MPI_Wtime(); 
+ if (rank == 0) { // Prepare element-node connectivity data for each rank std::vector all_nodes_in_elem; @@ -493,6 +556,13 @@ int main(int argc, char** argv) { MPI_Barrier(MPI_COMM_WORLD); + double t_scatter_elemnode_end = MPI_Wtime(); + if(rank == 0) { + std::cout << " Finished scattering the element-node connectivity data from the initial mesh to each rank" << std::endl; + std::cout << " Scattering element-node connectivity took " + << (t_scatter_elemnode_end - t_scatter_elemnode_start) << " seconds." << std::endl; + } + if (rank == 0 && print_info) { std::cout << "Rank " << rank << " received element-node connectivity (" @@ -508,6 +578,7 @@ int main(int argc, char** argv) { } MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished scattering the element-node connectivity data from the initial mesh to each rank"< elems_in_elem_on_rank(total_elem_elem_entries); // Now scatter the num_elems_in_elem for each element on each rank @@ -580,6 +663,9 @@ int main(int argc, char** argv) { 0, MPI_COMM_WORLD); } + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished scattering the actual element-element connectivity counts per element to each rank"< all_elems_in_elem; @@ -614,6 +700,9 @@ int main(int argc, char** argv) { MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished receiving the actual element-element connectivity entries to each rank"< new_elem_gids = recvbuf; @@ -1057,7 +1170,10 @@ int main(int argc, char** argv) { conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity counts"< conn_sdispls(world_size), conn_rdispls(world_size); int conn_send_total = 0, conn_recv_total = 0; @@ -1073,7 +1189,7 @@ int main(int argc, char** argv) { conn_recvbuf.data(), conn_recvcounts.data(), 
conn_rdispls.data(), MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity"< node_gid_set(conn_recvbuf.begin(), conn_recvbuf.end()); @@ -1115,6 +1231,7 @@ int main(int argc, char** argv) { MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates counts"< coord_sdispls(world_size), coord_rdispls(world_size); int coord_send_total = 0, coord_recv_total = 0; @@ -1130,6 +1247,7 @@ int main(int argc, char** argv) { coord_recvbuf.data(), coord_recvcounts.data(), coord_rdispls.data(), MPI_DOUBLE, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates"< Date: Wed, 29 Oct 2025 12:07:45 -0500 Subject: [PATCH 10/52] ENH: Adding ghost elements WIP --- examples/mesh_decomp/mesh.h | 7 +- examples/mesh_decomp/mesh_decomp.cpp | 213 ++++++++++++++++++++++++++- examples/mesh_decomp/mesh_io.h | 33 +++-- 3 files changed, 236 insertions(+), 17 deletions(-) diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h index 0011d2e8..92f3bcdf 100644 --- a/examples/mesh_decomp/mesh.h +++ b/examples/mesh_decomp/mesh.h @@ -308,16 +308,19 @@ struct Mesh_t // MPI Decomposition Data Definitions ---- // DCArrayKokkos local_to_global_node_mapping; ///< Local to global node mapping - DCArrayKokkos local_to_global_elem_mapping; ///< Local to global element mapping + + // Data structure for ghost elements required for MPI comms + size_t num_ghost_elems; ///< Number of ghost elements on this rank (from neighboring MPI ranks) + + // initialization methods void initialize_nodes(const size_t num_nodes_inp) { num_nodes = num_nodes_inp; - return; }; // end method diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index d6144eaf..564dd3f1 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp 
+++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include "mesh.h" @@ -112,7 +113,7 @@ int main(int argc, char** argv) { // Initial mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {100, 100, 100}; + int num_elems_dim[3] = {2, 2, 2}; Mesh_t initial_mesh; GaussPoint_t initial_GaussPoints; @@ -1069,6 +1070,7 @@ int main(int argc, char** argv) { * - Each local element's local index lid and global ID (gid) are listed with the * part to which PT-Scotch has assigned them. ***************************************************************************/ + print_info = true; for(int rank_id = 0; rank_id < world_size; rank_id++) { if(rank_id == rank && print_info) { for (size_t lid = 0; lid < mesh.num_elems; ++lid) { @@ -1080,6 +1082,7 @@ int main(int argc, char** argv) { } MPI_Barrier(MPI_COMM_WORLD); } + print_info = false; @@ -1335,6 +1338,214 @@ int main(int argc, char** argv) { final_mesh.build_connectivity(); MPI_Barrier(MPI_COMM_WORLD); + + +// ****************************************************************************************** +// Build the ghost elements +// ****************************************************************************************** + + double t_ghost_start = MPI_Wtime(); + + // Update host arrays for ghost detection + final_mesh.local_to_global_elem_mapping.update_host(); + final_mesh.local_to_global_node_mapping.update_host(); + final_mesh.nodes_in_elem.update_host(); + Kokkos::fence(); + + // Build a set of locally-owned element global IDs for fast lookup + std::set local_elem_gids; + for (int i = 0; i < num_new_elems; ++i) { + local_elem_gids.insert(final_mesh.local_to_global_elem_mapping.host(i)); + } + + // Exchange element GIDs with all ranks to know who owns what + // Collect all locally-owned element global IDs to send to other ranks + std::vector local_elem_gids_vec(local_elem_gids.begin(), local_elem_gids.end()); + + // First, gather 
the number of elements each rank owns + std::vector elem_counts(world_size); + int local_elem_count = static_cast(local_elem_gids_vec.size()); + + MPI_Allgather(&local_elem_count, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // Compute displacements + std::vector elem_displs(world_size); + int total_elems = 0; + for (int r = 0; r < world_size; ++r) { + elem_displs[r] = total_elems; + total_elems += elem_counts[r]; + } + + // Gather all element GIDs from all ranks + std::vector all_elem_gids(total_elems); + MPI_Allgatherv(local_elem_gids_vec.data(), local_elem_count, MPI_UNSIGNED_LONG_LONG, + all_elem_gids.data(), elem_counts.data(), elem_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + // Build a map: element GID -> owning rank + std::map elem_gid_to_rank; + for (int r = 0; r < world_size; ++r) { + for (int i = 0; i < elem_counts[r]; ++i) { + size_t gid = all_elem_gids[elem_displs[r] + i]; + elem_gid_to_rank[gid] = r; + } + } + + // Strategy: Find ghost elements by checking neighbors of our boundary elements. + // A boundary element is one that has a neighbor owned by another rank. + // However, since build_connectivity() only includes locally-owned elements, + // we need to use a different approach: find elements on other ranks that share + // nodes with our locally-owned elements. 
+ + // First, collect all nodes that belong to our locally-owned elements + std::set local_elem_nodes; + for (int lid = 0; lid < num_new_elems; ++lid) { + for (int j = 0; j < nodes_per_elem; ++j) { + size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); + local_elem_nodes.insert(node_gid); + } + } + + // Now collect element-to-node connectivity to send to all ranks + // Format: for each element, list its node GIDs (each entry is a pair: elem_gid, node_gid) + std::vector elem_node_conn; + int local_conn_size = 0; + + for (int lid = 0; lid < num_new_elems; ++lid) { + size_t elem_gid = final_mesh.local_to_global_elem_mapping.host(lid); + for (int j = 0; j < nodes_per_elem; ++j) { + size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); + elem_node_conn.push_back(elem_gid); + elem_node_conn.push_back(node_gid); + } + local_conn_size += nodes_per_elem * 2; // Each pair is 2 size_ts + } + + // Exchange element-node connectivity with all ranks using Allgather + // First, gather the sizes from each rank + std::vector conn_sizes(world_size); + MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // Compute displacements + std::vector conn_displs(world_size); + int total_conn = 0; + for (int r = 0; r < world_size; ++r) { + conn_displs[r] = total_conn; + total_conn += conn_sizes[r]; + } + + // Gather all element-node pairs from all ranks + std::vector all_conn(total_conn); + MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG, + all_conn.data(), conn_sizes.data(), conn_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + // Build a map: node GID -> set of element GIDs that contain it (from other ranks) + std::map> node_to_ext_elem; + for (int r = 0; r < world_size; ++r) { + if (r == rank) continue; // Skip our own data + // Process pairs from 
rank r: conn_sizes[r] is in units of size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + for (int i = 0; i < num_pairs; ++i) { + // Each pair is 2 size_ts, starting at conn_displs[r] + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this node is in one of our elements, then the element is a potential ghost + if (local_elem_nodes.find(node_gid) != local_elem_nodes.end()) { + // Check if this element is not owned by us + if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { + node_to_ext_elem[node_gid].insert(elem_gid); + } + } + } + } + + // Collect all unique ghost element GIDs + std::set ghost_elem_gids; + for (const auto& pair : node_to_ext_elem) { + for (size_t elem_gid : pair.second) { + ghost_elem_gids.insert(elem_gid); + } + } + + // Additional check: elements that are neighbors of our locally-owned elements + // but are owned by other ranks (these might already be in ghost_elem_gids, but check connectivity) + + for (int lid = 0; lid < num_new_elems; ++lid) { + size_t num_neighbors = final_mesh.num_elems_in_elem(lid); + + for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { + size_t neighbor_lid = final_mesh.elems_in_elem(lid, nbr_idx); + + if (neighbor_lid < static_cast(num_new_elems)) { + size_t neighbor_gid = final_mesh.local_to_global_elem_mapping(neighbor_lid); + + // Check if neighbor is owned by this rank + auto it = elem_gid_to_rank.find(neighbor_gid); + if (it != elem_gid_to_rank.end() && it->second != rank) { + // Neighbor is owned by another rank - it's a ghost for us + ghost_elem_gids.insert(neighbor_gid); + } + } + } + } + + // Count unique ghost elements + final_mesh.num_ghost_elems = ghost_elem_gids.size(); + + MPI_Barrier(MPI_COMM_WORLD); + double t_ghost_end = MPI_Wtime(); + + if (rank == 0) { + std::cout << " Finished calculating ghost elements" << std::endl; + std::cout << " Ghost element calculation took " << 
(t_ghost_end - t_ghost_start) << " seconds." << std::endl; + } + + // Print ghost element info if requested + print_info = true; + for(int i = 0; i < world_size; i++) { + if(rank == i && print_info) { + std::cout << "[rank " << rank << "] owns " << num_new_elems + << " elements and has " << final_mesh.num_ghost_elems << " ghost elements" << std::endl; + std::cout << "[rank " << rank << "] owned element global IDs: "; + for (int j = 0; j < num_new_elems; ++j) { + std::cout << final_mesh.local_to_global_elem_mapping(j) << " "; + } + std::cout << std::endl; + + + + std::cout << "[rank " << rank << "] ghost element GIDs: "; + for (size_t gid : ghost_elem_gids) { + std::cout << gid << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + + + + MPI_Barrier(MPI_COMM_WORLD); + + + + + + + + + + + + + + for(int i = 0; i < world_size; i++) { if(rank == i && print_info) { print_rank_mesh_info(final_mesh, i); diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index 95db8132..b044b599 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -231,16 +231,16 @@ void build_3d_box( ///////////////////////////////////////////////////////////////////////////// - /// - /// \fn write_vtk - /// - /// \brief Writes a vtk output file - /// - /// \param mesh mesh - /// \param node node data - /// \param rank rank - /// - ///////////////////////////////////////////////////////////////////////////// +/// +/// \fn write_vtk +/// +/// \brief Writes a vtk output file +/// +/// \param mesh mesh +/// \param node node data +/// \param rank rank +/// +///////////////////////////////////////////////////////////////////////////// void write_vtk(Mesh_t& mesh, node_t& node, int rank) @@ -276,7 +276,7 @@ void build_3d_box( Kokkos::fence(); - const int num_cell_scalar_vars = 1; + const int num_cell_scalar_vars = 2; const int num_cell_vec_vars = 0; const int num_cell_tensor_vars = 0; @@ -285,8 +285,8 @@ void build_3d_box( // 
Scalar values associated with a cell - const char cell_scalar_var_names[num_cell_scalar_vars][15] = { - "rank_id" + const char cell_scalar_var_names[num_cell_scalar_vars][30] = { + "rank_id", "elems_in_elem_owned" }; // const char cell_vec_var_names[num_cell_vec_vars][15] = { @@ -317,6 +317,11 @@ void build_3d_box( // export material centeric data to the elements elem_fields(0, 0) = rank; + for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + elem_fields(elem_gid, 0) = rank; + elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); + } + // save the vertex vector fields to an array for exporting to graphics files CArray vec_fields(num_nodes, num_point_vec_vars, 3); @@ -454,7 +459,7 @@ void build_3d_box( fprintf(out[0], "SCALARS %s float 1\n", cell_scalar_var_names[var]); // the 1 is number of scalar components [1:4] fprintf(out[0], "LOOKUP_TABLE default\n"); for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { - fprintf(out[0], "%f\n", rank); + fprintf(out[0], "%f\n", elem_fields(elem_gid, var)); } // end for elem } // end for cell scalar_vars From 659d72da6f6f89bcf361ba60c850c7b9cea657c4 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 31 Oct 2025 10:53:44 -0500 Subject: [PATCH 11/52] ENH: Adding ghost elements and nodes --- examples/mesh_decomp/mesh_decomp.cpp | 669 +++++++++++++++++++++++---- examples/mesh_decomp/mesh_io.h | 6 +- 2 files changed, 574 insertions(+), 101 deletions(-) diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 564dd3f1..85f2b7f2 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -16,86 +16,54 @@ #include "scotch.h" #include "ptscotch.h" -// Timer class for timing the execution of the matrix multiplication -class Timer { - private: - std::chrono::high_resolution_clock::time_point start_time; - std::chrono::high_resolution_clock::time_point end_time; - bool is_running; - - public: - Timer() : 
is_running(false) {} - - void start() { - start_time = std::chrono::high_resolution_clock::now(); - is_running = true; - } - - double stop() { - if (!is_running) { - std::cerr << "Timer was not running!" << std::endl; - return 0.0; - } - end_time = std::chrono::high_resolution_clock::now(); - is_running = false; - - auto duration = std::chrono::duration_cast(end_time - start_time); - return duration.count() / 1000.0; // Convert to milliseconds - } -}; -void print_rank_mesh_info(Mesh_t& mesh, int rank) { +void calc_elements_per_rank(std::vector& elems_per_rank, int num_elems, int world_size){ + // Compute elements to send to each rank; handle remainders for non-even distribution + std::fill(elems_per_rank.begin(), elems_per_rank.end(), num_elems / world_size); + int remainder = num_elems % world_size; + for (int i = 0; i < remainder; ++i) { + elems_per_rank[i] += 1; + } +} - std::cout<& elems_per_rank, int num_elems, int world_size){ - // Compute elements to send to each rank; handle remainders for non-even distribution - std::fill(elems_per_rank.begin(), elems_per_rank.end(), num_elems / world_size); - int remainder = num_elems % world_size; - for (int i = 0; i < remainder; ++i) { - elems_per_rank[i] += 1; - } -} +void print_rank_mesh_info(Mesh_t& mesh, int rank) { -void print_mesh_info(Mesh_t& mesh){ + std::cout<(neighbor_gid)); ++offset; // Increment running edge count @@ -1036,15 +1010,84 @@ int main(int argc, char** argv) { * Step 6: Partition (repartition) the mesh using PT-Scotch * - Each vertex (mesh element) will be assigned a part (mesh chunk). * - Arch is initialized for a complete graph of world_size parts (one per rank). - * - Loki **************************************************************/ + // SCOTCH_Arch controls the "architecture" for partitioning: the topology + // (number and connectivity of parts) to which the graph will be mapped. + // The archdat variable encodes this. 
Below are common options: + // + // - SCOTCH_archCmplt(&archdat, nbparts) + // * Creates a "complete graph" architecture with nbparts nodes (fully connected). + // Every part is equally distant from every other part. + // This is typically used when minimizing only *balance* and *edge cut*, + // not considering any underlying machine topology. + // + // - SCOTCH_archHcub(&archdat, dimension) + // * Hypercube architecture (rare in modern use). + // Sets up a hypercube of given dimension. + // + // - SCOTCH_archTleaf / SCOTCH_archTleafX + // * Tree architectures, for hierarchically structured architectures. + // + // - SCOTCH_archMesh2 / SCOTCH_archMesh3 + // * 2D or 3D mesh topology architectures (useful for grid/matrix machines). + // + // - SCOTCH_archBuild + // * General: builds any architecture from a descriptor string. + // + // For distributed mesh partitioning to MPI ranks (where all ranks are equal), + // the most common and appropriate is "complete graph" (Cmplt): each part (rank) + // is equally reachable from any other (no communication topology bias). SCOTCH_Arch archdat; // PT-Scotch architecture structure: describes desired partition topology SCOTCH_archInit(&archdat); - SCOTCH_archCmplt(&archdat, static_cast(world_size)); // Partition into world_size complete nodes + // Partition into 'world_size' equally connected parts (each MPI rank is a "node") + // Other topology options could be substituted above according to your needs (see docs). + SCOTCH_archCmplt(&archdat, static_cast(world_size)); + + + + // ===================== PT-Scotch Strategy Selection and Documentation ====================== + // The PT-Scotch "strategy" (stratdat here) controls the algorithms and heuristics used for partitioning. + // You can specify a string or build a strategy using functions that adjust speed, quality, and recursion. 
+ // + // Common strategy flags (see "scotch.h", "ptscotch.h", and PT-Scotch documentation): + // + // - SCOTCH_STRATDEFAULT: Use the default (fast, reasonable quality) partitioning strategy. + // Useful for quick, generic partitions where quality is not critical. + // + // - SCOTCH_STRATSPEED: Aggressively maximizes speed (at the cost of cut quality). + // For large runs or test runs where speed is more important than minimizing edgecut. + // + // - SCOTCH_STRATQUALITY: Prioritizes partition *quality* (minimizing edge cuts, maximizing load balance). + // Slower than the default. Use when high-quality partitioning is desired. + // + // - SCOTCH_STRATBALANCE: Tradeoff between speed and quality for balanced workload across partitions. + // Use if load balance is more critical than cut size. + // + // Additional Options: + // - Strategy can also be specified as a string (see Scotch manual, e.g., "b{sep=m{...} ...}"). + // - Recursion count parameter (here, set to 0) controls strategy recursion depth (0 = automatic). + // - Imbalance ratio (here, 0.01) allows minor imbalance in part weight for better cut quality. + // + // Example usage: + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATQUALITY, nparts, 0, 0.01); + // ^ quality-focused, nparts=number of parts/ranks + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATSPEED, nparts, 0, 0.05); + // ^ speed-focused, allow 5% imbalance + // + // Reference: + // - https://gitlab.inria.fr/scotch/scotch/-/blob/master/doc/libptscotch.pdf + // - SCOTCH_stratDgraphMapBuild() and related "strategy" documentation. 
+ // + // --------------- Set up the desired partitioning strategy here: --------------- SCOTCH_Strat stratdat; // PT-Scotch strategy object: holds partitioning options/settings SCOTCH_stratInit(&stratdat); - SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.01); // zero is recursion count, 0=automatic + + // Select partitioning strategy for this run: + // Use SCOTCH_STRATQUALITY for best cut quality. + // To change: replace with SCOTCH_STRATDEFAULT, SCOTCH_STRATSPEED, or SCOTCH_STRATBALANCE as discussed above. + // Arguments: (strategy object, strategy flag, #parts, recursion (0=auto), imbalance ratio) + SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.01); // partloctab: output array mapping each local element (vertex) to a *target partition number* // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. @@ -1070,7 +1113,7 @@ int main(int argc, char** argv) { * - Each local element's local index lid and global ID (gid) are listed with the * part to which PT-Scotch has assigned them. 
***************************************************************************/ - print_info = true; + print_info = false; for(int rank_id = 0; rank_id < world_size; rank_id++) { if(rank_id == rank && print_info) { for (size_t lid = 0; lid < mesh.num_elems; ++lid) { @@ -1159,7 +1202,7 @@ int main(int argc, char** argv) { for (int i = 0; i < mesh.num_elems; ++i) if (mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } - for (int j = 0; j < nodes_per_elem; ++j) { + for (int j = 0; j < nodes_per_elem; j++) { int node_lid = mesh.nodes_in_elem.host(lid, j); int node_gid = mesh.local_to_global_node_mapping.host(node_lid); conn_sendbuf.push_back(node_gid); @@ -1216,7 +1259,7 @@ int main(int argc, char** argv) { for (int i = 0; i < mesh.num_elems; ++i) if (mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } - for (int j = 0; j < nodes_per_elem; ++j) { + for (int j = 0; j < nodes_per_elem; j++) { int node_lid = mesh.nodes_in_elem.host(lid, j); int node_gid = mesh.local_to_global_node_mapping.host(node_lid); @@ -1308,7 +1351,7 @@ int main(int argc, char** argv) { std::map> node_gid_to_coords; int coord_idx = 0; for (int e = 0; e < num_new_elems; ++e) { - for (int j = 0; j < nodes_per_elem; ++j) { + for (int j = 0; j < nodes_per_elem; j++) { int node_gid = conn_recvbuf[e * nodes_per_elem + j]; if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { node_gid_to_coords[node_gid] = { @@ -1346,28 +1389,20 @@ int main(int argc, char** argv) { double t_ghost_start = MPI_Wtime(); - // Update host arrays for ghost detection - final_mesh.local_to_global_elem_mapping.update_host(); - final_mesh.local_to_global_node_mapping.update_host(); - final_mesh.nodes_in_elem.update_host(); - Kokkos::fence(); - - // Build a set of locally-owned element global IDs for fast lookup - std::set local_elem_gids; - for (int i = 0; i < num_new_elems; ++i) { - local_elem_gids.insert(final_mesh.local_to_global_elem_mapping.host(i)); - } - - // Exchange element 
GIDs with all ranks to know who owns what - // Collect all locally-owned element global IDs to send to other ranks - std::vector local_elem_gids_vec(local_elem_gids.begin(), local_elem_gids.end()); - // First, gather the number of elements each rank owns std::vector elem_counts(world_size); - int local_elem_count = static_cast(local_elem_gids_vec.size()); - - MPI_Allgather(&local_elem_count, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - + + // int MPI_Allgather( + // const void* sendbuf, // Data to send from this process + // int sendcount, // Number of elements to send + // MPI_Datatype sendtype, // Type of send data + // void* recvbuf, // Buffer to receive all data + // int recvcount, // Number of elements to receive from each process + // MPI_Datatype recvtype, // Type of receive data + // MPI_Comm comm // Communicator + // ); + MPI_Allgather(&final_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); // Compute displacements std::vector elem_displs(world_size); int total_elems = 0; @@ -1378,10 +1413,21 @@ int main(int argc, char** argv) { // Gather all element GIDs from all ranks std::vector all_elem_gids(total_elems); - MPI_Allgatherv(local_elem_gids_vec.data(), local_elem_count, MPI_UNSIGNED_LONG_LONG, + + // int MPI_Allgatherv( + // const void* sendbuf, // Data to send from this process + // int sendcount, // Number of elements THIS process sends + // MPI_Datatype sendtype, // Type of send data + // void* recvbuf, // Buffer to receive all data + // const int* recvcounts, // Array: number of elements from each process + // const int* displs, // Array: displacement for each process's data + // MPI_Datatype recvtype, // Type of receive data + // MPI_Comm comm // Communicator + // ); + MPI_Allgatherv(final_mesh.local_to_global_elem_mapping.host_pointer(), final_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, all_elem_gids.data(), elem_counts.data(), elem_displs.data(), MPI_UNSIGNED_LONG_LONG, 
MPI_COMM_WORLD); - + MPI_Barrier(MPI_COMM_WORLD); // Build a map: element GID -> owning rank std::map elem_gid_to_rank; for (int r = 0; r < world_size; ++r) { @@ -1400,7 +1446,7 @@ int main(int argc, char** argv) { // First, collect all nodes that belong to our locally-owned elements std::set local_elem_nodes; for (int lid = 0; lid < num_new_elems; ++lid) { - for (int j = 0; j < nodes_per_elem; ++j) { + for (int j = 0; j < nodes_per_elem; j++) { size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); local_elem_nodes.insert(node_gid); @@ -1414,7 +1460,7 @@ int main(int argc, char** argv) { for (int lid = 0; lid < num_new_elems; ++lid) { size_t elem_gid = final_mesh.local_to_global_elem_mapping.host(lid); - for (int j = 0; j < nodes_per_elem; ++j) { + for (int j = 0; j < nodes_per_elem; j++) { size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); elem_node_conn.push_back(elem_gid); @@ -1427,7 +1473,7 @@ int main(int argc, char** argv) { // First, gather the sizes from each rank std::vector conn_sizes(world_size); MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); - + MPI_Barrier(MPI_COMM_WORLD); // Compute displacements std::vector conn_displs(world_size); int total_conn = 0; @@ -1441,6 +1487,12 @@ int main(int argc, char** argv) { MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG, all_conn.data(), conn_sizes.data(), conn_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + // create a set for local_elem_gids + std::set local_elem_gids; + for (int i = 0; i < num_new_elems; ++i) { + local_elem_gids.insert(final_mesh.local_to_global_elem_mapping.host(i)); + } // Build a map: node GID -> set of element GIDs that contain it (from other ranks) std::map> node_to_ext_elem; @@ -1506,35 +1558,448 @@ int main(int argc, 
char** argv) { } // Print ghost element info if requested - print_info = true; + print_info = false; for(int i = 0; i < world_size; i++) { + MPI_Barrier(MPI_COMM_WORLD); if(rank == i && print_info) { std::cout << "[rank " << rank << "] owns " << num_new_elems << " elements and has " << final_mesh.num_ghost_elems << " ghost elements" << std::endl; std::cout << "[rank " << rank << "] owned element global IDs: "; - for (int j = 0; j < num_new_elems; ++j) { + for (int j = 0; j < final_mesh.num_elems; j++) { std::cout << final_mesh.local_to_global_elem_mapping(j) << " "; } - std::cout << std::endl; - - - - std::cout << "[rank " << rank << "] ghost element GIDs: "; - for (size_t gid : ghost_elem_gids) { + + // Print global IDs of ghost elements + std::cout << std::endl << "[rank " << rank << "] ghost element global IDs: "; + for (const auto& gid : ghost_elem_gids) { std::cout << gid << " "; } std::cout << std::endl; } + + MPI_Barrier(MPI_COMM_WORLD); + } + + + + // Build the connectivity that includes ghost elements + // Create an extended mesh with owned elements first, then ghost elements appended + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build extended mesh with ghost elements" << std::endl; + + // Step 1: Extract ghost element-node connectivity from all_conn + // Build a map: ghost_elem_gid -> vector of node_gids (ordered as in all_conn) + std::map> ghost_elem_to_nodes; + for (const size_t& ghost_gid : ghost_elem_gids) { + ghost_elem_to_nodes[ghost_gid].reserve(nodes_per_elem); + } + + // Extract nodes for each ghost element from all_conn + // The all_conn array has pairs (elem_gid, node_gid) for each rank's elements + for (int r = 0; r < world_size; ++r) { + if (r == rank) continue; // Skip our own data (we already have owned element connectivity) + int num_pairs = conn_sizes[r] / 2; + + // Process pairs in order - each element's nodes are contiguous + for (int i = 0; i < num_pairs; ++i) { + int offset = conn_displs[r] + i * 2; + 
size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this is one of our ghost elements, record its node (in order) + auto it = ghost_elem_to_nodes.find(elem_gid); + if (it != ghost_elem_to_nodes.end()) { + it->second.push_back(node_gid); + } + } + } + + // Verify each ghost element has the correct number of nodes + for (auto& pair : ghost_elem_to_nodes) { + if (pair.second.size() != static_cast(nodes_per_elem)) { + std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first + << " has " << pair.second.size() << " nodes, expected " << nodes_per_elem << std::endl; + } + } + + // Step 2: Build extended node list (owned nodes first, then ghost-only nodes) + // Start with owned nodes + std::map node_gid_to_extended_lid; + int extended_node_lid = 0; + + // Add all owned nodes + for (int i = 0; i < final_mesh.num_nodes; ++i) { + size_t node_gid = final_mesh.local_to_global_node_mapping.host(i); + node_gid_to_extended_lid[node_gid] = extended_node_lid++; + } + + // Add ghost-only nodes (nodes that belong to ghost elements but not to owned elements) + std::set ghost_only_nodes; + for (const auto& pair : ghost_elem_to_nodes) { + for (size_t node_gid : pair.second) { + // Check if we already have this node + if (node_gid_to_extended_lid.find(node_gid) == node_gid_to_extended_lid.end()) { + ghost_only_nodes.insert(node_gid); + } + } + } + + // Assign extended local IDs to ghost-only nodes + for (size_t node_gid : ghost_only_nodes) { + node_gid_to_extended_lid[node_gid] = extended_node_lid++; + } + + int total_extended_nodes = extended_node_lid; + + // Step 3: Prepare requests for ghost node coordinates from owning ranks (if needed later) + // Build request list: for each ghost node, find an owning rank via any ghost element that contains it + std::map> rank_to_ghost_node_requests; + for (size_t node_gid : ghost_only_nodes) { + // Find which rank owns an element containing this node + // Look through ghost elements + for (const 
auto& pair : ghost_elem_to_nodes) { + size_t ghost_elem_gid = pair.first; + const std::vector& nodes = pair.second; + bool found = false; + for (size_t ngid : nodes) { + if (ngid == node_gid) { + found = true; + break; + } + } + if (found) { + auto owner_it = elem_gid_to_rank.find(ghost_elem_gid); + if (owner_it != elem_gid_to_rank.end()) { + rank_to_ghost_node_requests[owner_it->second].push_back(node_gid); + break; + } + } + } + } + + // Step 4: Build extended element list and node connectivity + // Owned elements: 0 to num_new_elems-1 (already have these) + // Ghost elements: num_new_elems to num_new_elems + num_ghost_elems - 1 + + // Create extended element-node connectivity array + int total_extended_elems = final_mesh.num_elems + final_mesh.num_ghost_elems; + std::vector> extended_nodes_in_elem(total_extended_elems); + + // Copy owned element connectivity (convert to extended node LIDs) + for (int lid = 0; lid < final_mesh.num_elems; ++lid) { + extended_nodes_in_elem[lid].reserve(nodes_per_elem); + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[lid].push_back(ext_lid); + } + } + + // Add ghost element connectivity (map ghost node GIDs to extended node LIDs) + int ghost_elem_ext_lid = final_mesh.num_elems; + std::vector ghost_elem_gids_ordered(ghost_elem_gids.begin(), ghost_elem_gids.end()); + std::sort(ghost_elem_gids_ordered.begin(), ghost_elem_gids_ordered.end()); + + for (size_t ghost_gid : ghost_elem_gids_ordered) { + auto it = ghost_elem_to_nodes.find(ghost_gid); + if (it == ghost_elem_to_nodes.end()) continue; + + extended_nodes_in_elem[ghost_elem_ext_lid].reserve(nodes_per_elem); + for (size_t node_gid : it->second) { + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[ghost_elem_ext_lid].push_back(ext_lid); + } + 
ghost_elem_ext_lid++; + } + + MPI_Barrier(MPI_COMM_WORLD); + // Sequential rank-wise printing of extended mesh structure info + for (int r = 0; r < world_size; ++r) { + if (rank == r) { + std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; + std::cout << "[rank " << rank << "] - Owned elements: " << final_mesh.num_elems << std::endl; + std::cout << "[rank " << rank << "] - Ghost elements: " << final_mesh.num_ghost_elems << std::endl; + std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << std::endl; + std::cout << "[rank " << rank << "] - Owned nodes: " << final_mesh.num_nodes << std::endl; + std::cout << "[rank " << rank << "] - Ghost-only nodes: " << ghost_only_nodes.size() << std::endl; + std::cout << "[rank " << rank << "] - Total extended nodes: " << total_extended_nodes << std::endl; + std::cout << std::flush; + } MPI_Barrier(MPI_COMM_WORLD); } + + // The extended_nodes_in_elem vector now contains the connectivity for both owned and ghost elements + // Each element's nodes are stored using extended local node IDs (0-based, contiguous) + + // Build reverse maps: extended_lid -> gid for nodes and elements + std::vector extended_lid_to_node_gid(total_extended_nodes); + for (const auto& pair : node_gid_to_extended_lid) { + extended_lid_to_node_gid[pair.second] = pair.first; + } + + // Build extended element GID list: owned first, then ghost + std::vector extended_lid_to_elem_gid(total_extended_elems); + // Owned elements + for (int i = 0; i < final_mesh.num_elems; ++i) { + extended_lid_to_elem_gid[i] = final_mesh.local_to_global_elem_mapping.host(i); + } + // Ghost elements (in sorted order) + for (size_t idx = 0; idx < ghost_elem_gids_ordered.size(); ++idx) { + extended_lid_to_elem_gid[final_mesh.num_elems + idx] = ghost_elem_gids_ordered[idx]; + } + + mesh_with_ghosts.initialize_nodes(total_extended_nodes); + mesh_with_ghosts.initialize_elems(total_extended_elems, 3); + 
mesh_with_ghosts.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); + mesh_with_ghosts.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); + for (int i = 0; i < total_extended_nodes; i++) { + mesh_with_ghosts.local_to_global_node_mapping.host(i) = extended_lid_to_node_gid[i]; + } + for (int i = 0; i < total_extended_elems; i++) { + mesh_with_ghosts.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; + } + mesh_with_ghosts.local_to_global_node_mapping.update_device(); + mesh_with_ghosts.local_to_global_elem_mapping.update_device(); + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< coordinate. + // 3. Use this map to fill node_with_ghosts.coords. + + // 1. Build list of all global node IDs needed on this rank (owned + ghosts) + std::vector all_needed_node_gids(total_extended_nodes); + for (int i = 0; i < total_extended_nodes; ++i) { + all_needed_node_gids[i] = mesh_with_ghosts.local_to_global_node_mapping.host(i); + } + + // 2. Build owned node GIDs and their coordinates + std::vector owned_gids(final_mesh.num_nodes); + for (int i = 0; i < owned_gids.size(); ++i) + owned_gids[i] = final_mesh.local_to_global_node_mapping.host(i); + + // 3. Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) + // so we can distribute the needed coordinate data. 
+ // The easiest is to Allgather everyone's "owned_gids" and coords + + int local_owned_count = static_cast(owned_gids.size()); + std::vector owned_counts(world_size, 0); + if (local_owned_count < 0) local_owned_count = 0; // Clean up possibility of -1 + + // a) Gather counts + owned_counts.resize(world_size, 0); + MPI_Allgather(&local_owned_count, 1, MPI_INT, owned_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // b) Displacements and total + std::vector owned_displs(world_size,0); + int total_owned = 0; + for (int r=0; r all_owned_gids(total_owned); + MPI_Allgatherv(owned_gids.data(), local_owned_count, MPI_UNSIGNED_LONG_LONG, + all_owned_gids.data(), owned_counts.data(), owned_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + // d) Global coords (size: total_owned x 3) + std::vector owned_coords_send(3*local_owned_count, 0.0); + for (int i=0; i all_owned_coords(3 * total_owned, 0.0); + + // Create coordinate-specific counts and displacements (in units of doubles, not nodes) + std::vector coord_counts(world_size); + std::vector coord_displs(world_size); + for (int r=0; r coord[3] + std::unordered_map> gid_to_coord; + for (int i=0; i xyz = { + all_owned_coords[3*i+0], + all_owned_coords[3*i+1], + all_owned_coords[3*i+2] + }; + gid_to_coord[all_owned_gids[i]] = xyz; + } + + // 4. Finally, fill node_with_ghosts.coords with correct coordinates. 
+ for (int i = 0; i < total_extended_nodes; ++i) { + size_t gid = mesh_with_ghosts.local_to_global_node_mapping.host(i); + auto it = gid_to_coord.find(gid); + if (it != gid_to_coord.end()) { + node_with_ghosts.coords.host(i,0) = it->second[0]; + node_with_ghosts.coords.host(i,1) = it->second[1]; + node_with_ghosts.coords.host(i,2) = it->second[2]; + } else { + // Could happen if there's a bug: fill with zeros for safety + node_with_ghosts.coords.host(i,0) = 0.0; + node_with_ghosts.coords.host(i,1) = 0.0; + node_with_ghosts.coords.host(i,2) = 0.0; + } + } + node_with_ghosts.coords.update_device(); + + + + + // -------------------------------------------------------------------------------------- + // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost element GIDs. + // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. + // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
+ // -------------------------------------------------------------------------------------- + std::vector>> boundary_elem_targets(final_mesh.num_elems); + + // Prepare local ghost list as vector + std::vector ghost_gids_vec; + ghost_gids_vec.reserve(ghost_elem_gids.size()); + for (const auto &g : ghost_elem_gids) ghost_gids_vec.push_back(g); + + // Exchange counts + std::vector ghost_counts(world_size, 0); + int local_ghost_count = static_cast(ghost_gids_vec.size()); + MPI_Allgather(&local_ghost_count, 1, MPI_INT, ghost_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // Displacements and recv buffer + std::vector ghost_displs(world_size, 0); + int total_ghosts = 0; + for (int r = 0; r < world_size; ++r) { + ghost_displs[r] = total_ghosts; + total_ghosts += ghost_counts[r]; + } + std::vector all_ghost_gids(total_ghosts); + + // Gather ghost gids + MPI_Allgatherv(ghost_gids_vec.data(), local_ghost_count, MPI_UNSIGNED_LONG_LONG, + all_ghost_gids.data(), ghost_counts.data(), ghost_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Finished gathering ghost element GIDs" << std::endl; + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build the reverse map for communication" << std::endl; + // Build map gid -> ranks that ghost it + std::unordered_map> gid_to_ghosting_ranks; + gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); + for (int r = 0; r < world_size; ++r) { + int cnt = ghost_counts[r]; + int off = ghost_displs[r]; + for (int i = 0; i < cnt; ++i) { + size_t g = all_ghost_gids[off + i]; + gid_to_ghosting_ranks[g].push_back(r); + } + } + + // For each local element, list destinations: ranks that ghost our gid + for (int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { + size_t local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); + auto it = gid_to_ghosting_ranks.find(local_elem_gid); + if (it == gid_to_ghosting_ranks.end()) 
continue; + const std::vector &dest_ranks = it->second; + for (int rr : dest_ranks) { + if (rr == rank) continue; + boundary_elem_targets[elem_lid].push_back(std::make_pair(rr, local_elem_gid)); + } + } + + std::cout.flush(); + MPI_Barrier(MPI_COMM_WORLD); + // Optional: print a compact summary of reverse map for verification (limited output) + for(int i = 0; i < world_size; i++) { + if (rank == i && print_info) { + std::cout << std::endl; + for (int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { + + size_t local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); + if (boundary_elem_targets[elem_lid].empty()) + { + std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: no ghost elements" << std::endl; + } + else + { + std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: "; + int shown = 0; + for (const auto &pr : boundary_elem_targets[elem_lid]) { + if (shown >= 12) { std::cout << " ..."; break; } + std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; + shown++; + } + std::cout << std::endl; + } + } + std::cout.flush(); + } + MPI_Barrier(MPI_COMM_WORLD); + } + + print_info = false; MPI_Barrier(MPI_COMM_WORLD); +// NOTES: +// We need to create communication maps for nodes, specifically an index list of +// -- Owned (nodes unique to this rank) +// -- Shared (nodes on the boundary of this rank) +// -- Ghost (nodes on the boundary of this rank that are owned by other ranks) + + +// What we currently have is a communication plan for elements, eg. Each shared element (element on an MPI boundary) knows which rank and associated element global id on that rank it is connected to. 
+ + + + + @@ -1555,16 +2020,24 @@ int main(int argc, char** argv) { MPI_Barrier(MPI_COMM_WORLD); - write_vtk(final_mesh, final_node, rank); + // write_vtk(final_mesh, final_node, rank); + write_vtk(mesh_with_ghosts, node_with_ghosts, rank); + + + MPI_Barrier(MPI_COMM_WORLD); + + // Stop timer and get execution time + double t_main_end = MPI_Wtime(); + + if(rank == 0) { + printf("Total execution time: %.2f seconds\n", t_main_end - t_main_start); + } } // end MATAR scope MATAR_FINALIZE(); MPI_Finalize(); - // Stop timer and get execution time - double time_ms = timer.stop(); - - printf("Execution time: %.2f ms\n", time_ms); + return 0; } \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index b044b599..2a704e14 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -276,7 +276,7 @@ void build_3d_box( Kokkos::fence(); - const int num_cell_scalar_vars = 2; + const int num_cell_scalar_vars = 3; const int num_cell_vec_vars = 0; const int num_cell_tensor_vars = 0; @@ -286,7 +286,7 @@ void build_3d_box( // Scalar values associated with a cell const char cell_scalar_var_names[num_cell_scalar_vars][30] = { - "rank_id", "elems_in_elem_owned" + "rank_id", "elems_in_elem_owned", "global_elem_id" }; // const char cell_vec_var_names[num_cell_vec_vars][15] = { @@ -315,11 +315,11 @@ void build_3d_box( // export material centeric data to the elements - elem_fields(0, 0) = rank; for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { elem_fields(elem_gid, 0) = rank; elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); + elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); } From 717271f99a6601912cc30c9516ffc12a828ddd3f Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 31 Oct 2025 14:38:58 -0500 Subject: [PATCH 12/52] ENH: Adding vtu output, and tidy up --- examples/mesh_decomp/mesh.h | 5 +- examples/mesh_decomp/mesh_decomp.cpp | 29 ++- 
examples/mesh_decomp/mesh_io.h | 309 +++++++++++++++++++++++++-- 3 files changed, 320 insertions(+), 23 deletions(-) diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h index 92f3bcdf..a745e17e 100644 --- a/examples/mesh_decomp/mesh.h +++ b/examples/mesh_decomp/mesh.h @@ -310,9 +310,12 @@ struct Mesh_t DCArrayKokkos local_to_global_node_mapping; ///< Local to global node mapping DCArrayKokkos local_to_global_elem_mapping; ///< Local to global element mapping - // Data structure for ghost elements required for MPI comms + size_t num_owned_elems; ///< Number of owned elements on this rank size_t num_ghost_elems; ///< Number of ghost elements on this rank (from neighboring MPI ranks) + size_t num_owned_nodes; ///< Number of owned nodes on this rank + size_t num_ghost_nodes; ///< Number of ghost nodes on this rank (from neighboring MPI ranks) + diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 85f2b7f2..45ffc2f5 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -84,7 +84,7 @@ int main(int argc, char** argv) { // Initial mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {4, 4, 1}; + int num_elems_dim[3] = {25, 25, 25}; Mesh_t initial_mesh; GaussPoint_t initial_GaussPoints; @@ -1720,7 +1720,7 @@ int main(int argc, char** argv) { if (rank == r) { std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; std::cout << "[rank " << rank << "] - Owned elements: " << final_mesh.num_elems << std::endl; - std::cout << "[rank " << rank << "] - Ghost elements: " << final_mesh.num_ghost_elems << std::endl; + std::cout << "[rank " << rank << "] - Ghost elements: " << ghost_elem_gids.size() << std::endl; std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << std::endl; std::cout << "[rank " << rank << "] - Owned nodes: " << final_mesh.num_nodes << 
std::endl; std::cout << "[rank " << rank << "] - Ghost-only nodes: " << ghost_only_nodes.size() << std::endl; @@ -1763,6 +1763,24 @@ int main(int argc, char** argv) { mesh_with_ghosts.local_to_global_node_mapping.update_device(); mesh_with_ghosts.local_to_global_elem_mapping.update_device(); + mesh_with_ghosts.num_ghost_elems = ghost_elem_gids.size(); + mesh_with_ghosts.num_ghost_nodes = ghost_only_nodes.size(); + + // Set owned counts for write_vtk (excludes ghost elements/nodes) + mesh_with_ghosts.num_owned_elems = final_mesh.num_elems; + mesh_with_ghosts.num_owned_nodes = final_mesh.num_nodes; + + + // Print num ghost elements and nodes on each rank sequentially + for (int r = 0; r < world_size; ++r) { + if (rank == r) { + std::cout << "*******[rank " << rank << "] - Ghost elements: " << mesh_with_ghosts.num_ghost_elems << std::endl; + std::cout << "*******[rank " << rank << "] - Ghost-only nodes: " << mesh_with_ghosts.num_ghost_nodes << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"<(num_elems, num_cell_scalar_vars); int elem_switch = 1; @@ -316,7 +317,7 @@ void build_3d_box( // export material centeric data to the elements - for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { elem_fields(elem_gid, 0) = rank; elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); @@ -334,6 +335,11 @@ void build_3d_box( vec_fields(node_gid, 0, 2) = node.coords.host(node_gid, 2); point_scalar_fields(node_gid, 0) = rank; + point_scalar_fields(node_gid, 1) = (double)mesh.num_corners_in_node(node_gid); + + if(node_gid == 0) { + std::cout << "*******[rank " << rank << "] - num_corners_in_node: " << mesh.num_corners_in_node(node_gid) << std::endl; + } 
} // end for loop over vertices @@ -362,10 +368,10 @@ void build_3d_box( fprintf(out[0], "ASCII \n"); // part 3 fprintf(out[0], "DATASET UNSTRUCTURED_GRID\n\n"); // part 4 - fprintf(out[0], "POINTS %zu float\n", mesh.num_nodes); + fprintf(out[0], "POINTS %zu float\n", num_nodes); // write all components of the point coordinates - for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { fprintf(out[0], "%f %f %f\n", node.coords.host(node_gid, 0), @@ -380,7 +386,7 @@ void build_3d_box( */ fprintf(out[0], "\n"); - fprintf(out[0], "CELLS %lu %lu\n", mesh.num_elems, mesh.num_elems + mesh.num_elems * mesh.num_nodes_in_elem); // size=all printed values + fprintf(out[0], "CELLS %lu %lu\n", num_elems, num_elems + num_elems * mesh.num_nodes_in_elem); // size=all printed values int Pn_order = mesh.Pn; int order[3] = { Pn_order, Pn_order, Pn_order }; @@ -388,7 +394,7 @@ void build_3d_box( // const int num_1D_points = Pn_order+1; // write all global point numbers for this elem - for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { fprintf(out[0], "%lu ", mesh.num_nodes_in_elem); // num points in this elem for (int k = 0; k <= Pn_order; k++) { @@ -405,14 +411,14 @@ void build_3d_box( // Write the element types fprintf(out[0], "\n"); - fprintf(out[0], "CELL_TYPES %zu \n", mesh.num_elems); + fprintf(out[0], "CELL_TYPES %zu \n", num_elems); // VTK_LAGRANGE_HEXAHEDRON: 72, // VTK_HIGHER_ORDER_HEXAHEDRON: 67 // VTK_BIQUADRATIC_QUADRATIC_HEXAHEDRON = 33 // element types: https://vtk.org/doc/nightly/html/vtkCellType_8h_source.html // element types: https://kitware.github.io/vtk-js/api/Common_DataModel_CellTypes.html // vtk format: https://www.kitware.com//modeling-arbitrary-order-lagrange-finite-elements-in-the-visualization-toolkit/ - for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + for (size_t elem_gid = 0; 
elem_gid < num_elems; elem_gid++) { fprintf(out[0], "%d \n", 72); } @@ -423,12 +429,12 @@ void build_3d_box( */ fprintf(out[0], "\n"); - fprintf(out[0], "POINT_DATA %zu \n", mesh.num_nodes); + fprintf(out[0], "POINT_DATA %zu \n", num_nodes); // vtk vector vars = (position, velocity) for (int var = 0; var < num_point_vec_vars; var++) { fprintf(out[0], "VECTORS %s float \n", point_vec_var_names[var]); - for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { fprintf(out[0], "%f %f %f\n", vec_fields(node_gid, var, 0), vec_fields(node_gid, var, 1), @@ -437,15 +443,15 @@ void build_3d_box( } // end for vec_vars - // vtk scalar vars = (temp) + // vtk scalar vars = (rank_id, elems_in_node) for (int var = 0; var < num_point_scalar_vars; var++) { fprintf(out[0], "SCALARS %s float 1\n", point_scalar_var_names[var]); fprintf(out[0], "LOOKUP_TABLE default\n"); - for (size_t node_gid = 0; node_gid < mesh.num_nodes; node_gid++) { + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { fprintf(out[0], "%f\n", - point_scalar_fields(node_gid, 0)); + point_scalar_fields(node_gid, var)); } // end for nodes - } // end for vec_vars + } // end for scalar_vars /* --------------------------------------------------------------------------- @@ -453,12 +459,12 @@ void build_3d_box( --------------------------------------------------------------------------- */ fprintf(out[0], "\n"); - fprintf(out[0], "CELL_DATA %zu \n", mesh.num_elems); + fprintf(out[0], "CELL_DATA %zu \n", num_elems); for (int var = 0; var < num_cell_scalar_vars; var++) { fprintf(out[0], "SCALARS %s float 1\n", cell_scalar_var_names[var]); // the 1 is number of scalar components [1:4] fprintf(out[0], "LOOKUP_TABLE default\n"); - for (size_t elem_gid = 0; elem_gid < mesh.num_elems; elem_gid++) { + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { fprintf(out[0], "%f\n", elem_fields(elem_gid, var)); } // end for elem } // end for 
cell scalar_vars @@ -496,5 +502,270 @@ void build_3d_box( } // end write vtk old +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn write_vtu +/// +/// \brief Writes a VTU (XML VTK) output file per MPI rank and a PVTU file +/// for parallel visualization in ParaView +/// +/// \param mesh mesh +/// \param node node data +/// \param rank MPI rank +/// \param comm MPI communicator +/// +///////////////////////////////////////////////////////////////////////////// +void write_vtu(Mesh_t& mesh, + node_t& node, + int rank, + MPI_Comm comm) +{ + int world_size; + MPI_Comm_size(comm, &world_size); + + CArray graphics_times(1); + int graphics_id = 0; + graphics_times(0) = 0.0; + + // ---- Update host data ---- + node.coords.update_host(); + Kokkos::fence(); + + const int num_cell_scalar_vars = 3; + const int num_cell_vec_vars = 0; + const int num_cell_tensor_vars = 0; + + const int num_point_scalar_vars = 2; + const int num_point_vec_vars = 1; + + // Scalar values associated with a cell + const char cell_scalar_var_names[num_cell_scalar_vars][30] = { + "rank_id", "elems_in_elem_owned", "global_elem_id" + }; + + const char point_scalar_var_names[num_point_scalar_vars][15] = { + "rank_id", "elems_in_node" + }; + + const char point_vec_var_names[num_point_vec_vars][15] = { + "pos" + }; + + // short hand + const size_t num_nodes = mesh.num_owned_nodes; + const size_t num_elems = mesh.num_owned_elems; + const size_t num_dims = mesh.num_dims; + + // save the cell state to an array for exporting to graphics files + auto elem_fields = CArray(num_elems, num_cell_scalar_vars); + + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + elem_fields(elem_gid, 0) = rank; + elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); + elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); + } + + // save the vertex vector fields to an array for exporting to graphics files + CArray vec_fields(num_nodes, 
num_point_vec_vars, 3); + CArray point_scalar_fields(num_nodes, num_point_scalar_vars); + + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + // position, var 0 + vec_fields(node_gid, 0, 0) = node.coords.host(node_gid, 0); + vec_fields(node_gid, 0, 1) = node.coords.host(node_gid, 1); + vec_fields(node_gid, 0, 2) = node.coords.host(node_gid, 2); + + point_scalar_fields(node_gid, 0) = rank; + point_scalar_fields(node_gid, 1) = (double)mesh.num_corners_in_node(node_gid); + } + + // File management + char filename[200]; + int max_len = sizeof filename; + int str_output_len; + + struct stat st; + if (stat("vtk", &st) != 0) { + system("mkdir vtk"); + } + + // Create VTU filename for this rank + str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d_rank%d.vtu", graphics_id, rank); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + FILE* vtu_file = fopen(filename, "w"); + if (!vtu_file) { + std::cerr << "[rank " << rank << "] Failed to open VTU file: " << filename << std::endl; + return; + } + + // Write VTU XML header + fprintf(vtu_file, "\n"); + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n", num_nodes, num_elems); + + // Write Points (coordinates) + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(vtu_file, " %f %f %f\n", + node.coords.host(node_gid, 0), + node.coords.host(node_gid, 1), + node.coords.host(node_gid, 2)); + } + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + + // Write Cells (connectivity) + fprintf(vtu_file, " \n"); + + // Connectivity array - all node indices for all cells, space-separated + fprintf(vtu_file, " \n"); + int Pn_order = mesh.Pn; + int order[3] = { Pn_order, Pn_order, Pn_order }; + + // Write connectivity: all node IDs for all elements, space-separated + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + for (int k = 0; k <= Pn_order; k++) 
{ + for (int j = 0; j <= Pn_order; j++) { + for (int i = 0; i <= Pn_order; i++) { + size_t node_lid = PointIndexFromIJK(i, j, k, order); + size_t node_idx = mesh.nodes_in_elem.host(elem_gid, node_lid); + // Cast to int for Int32 format (valid for node indices < 2^31) + fprintf(vtu_file, " %d", static_cast(node_idx)); + } + } + } + } + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + + // Offsets array - cumulative index where each cell's connectivity ends + fprintf(vtu_file, " \n"); + int offset = 0; + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + offset += static_cast(mesh.num_nodes_in_elem); + fprintf(vtu_file, " %d", offset); + } + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + + // Types array (72 = VTK_LAGRANGE_HEXAHEDRON) + fprintf(vtu_file, " \n"); + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + fprintf(vtu_file, " 72"); + } + fprintf(vtu_file, "\n"); + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + + // Write PointData (node fields) + fprintf(vtu_file, " \n"); + + // Point vector variables + for (int var = 0; var < num_point_vec_vars; var++) { + fprintf(vtu_file, " \n", + point_vec_var_names[var]); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(vtu_file, " %f %f %f\n", + vec_fields(node_gid, var, 0), + vec_fields(node_gid, var, 1), + vec_fields(node_gid, var, 2)); + } + fprintf(vtu_file, " \n"); + } + + // Point scalar variables + for (int var = 0; var < num_point_scalar_vars; var++) { + fprintf(vtu_file, " \n", + point_scalar_var_names[var]); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { + fprintf(vtu_file, " %f\n", point_scalar_fields(node_gid, var)); + } + fprintf(vtu_file, " \n"); + } + fprintf(vtu_file, " \n"); + + // Write CellData (element fields) + fprintf(vtu_file, " \n"); + for (int var = 0; var < num_cell_scalar_vars; var++) { + fprintf(vtu_file, " \n", + cell_scalar_var_names[var]); + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) 
{ + fprintf(vtu_file, " %f\n", elem_fields(elem_gid, var)); + } + fprintf(vtu_file, " \n"); + } + fprintf(vtu_file, " \n"); + + // Close VTU file + fprintf(vtu_file, " \n"); + fprintf(vtu_file, " \n"); + fprintf(vtu_file, "\n"); + fclose(vtu_file); + + // Write PVTU file (only rank 0, after all ranks have written their VTU files) + MPI_Barrier(comm); + + if (rank == 0) { + str_output_len = snprintf(filename, max_len, "vtk/Fierro.%05d.pvtu", graphics_id); + if (str_output_len >= max_len) { fputs("Filename length exceeded; string truncated", stderr); } + + FILE* pvtu_file = fopen(filename, "w"); + if (!pvtu_file) { + std::cerr << "[rank 0] Failed to open PVTU file: " << filename << std::endl; + return; + } + + // Write PVTU XML header + fprintf(pvtu_file, "\n"); + fprintf(pvtu_file, "\n"); + fprintf(pvtu_file, " \n"); + + // Write PPoints + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + + // Write PCells + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, " \n"); + + // Write PPointData + fprintf(pvtu_file, " \n"); + for (int var = 0; var < num_point_vec_vars; var++) { + fprintf(pvtu_file, " \n", + point_vec_var_names[var]); + } + for (int var = 0; var < num_point_scalar_vars; var++) { + fprintf(pvtu_file, " \n", + point_scalar_var_names[var]); + } + fprintf(pvtu_file, " \n"); + + // Write PCellData + fprintf(pvtu_file, " \n"); + for (int var = 0; var < num_cell_scalar_vars; var++) { + fprintf(pvtu_file, " \n", + cell_scalar_var_names[var]); + } + fprintf(pvtu_file, " \n"); + + // Write Piece references for each rank + for (int r = 0; r < world_size; r++) { + fprintf(pvtu_file, " \n", graphics_id, r); + } + + // Close PVTU file + fprintf(pvtu_file, " \n"); + fprintf(pvtu_file, "\n"); + fclose(pvtu_file); + } + +} // end write_vtu #endif \ No newline at end of file From 761faef6bd82d2262cb797e53d93322494239c9e Mon Sep 17 00:00:00 2001 From: 
Jacob Moore Date: Fri, 31 Oct 2025 14:51:04 -0500 Subject: [PATCH 13/52] ENH: Cleaning up, WIP --- examples/mesh_decomp/decomp_utils.h | 0 examples/mesh_decomp/mesh_decomp.cpp | 241 ++++++++++++--------------- examples/mesh_decomp/mesh_io.h | 4 +- 3 files changed, 103 insertions(+), 142 deletions(-) create mode 100644 examples/mesh_decomp/decomp_utils.h diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h new file mode 100644 index 00000000..e69de29b diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 45ffc2f5..4cc6da9e 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -26,20 +26,6 @@ void calc_elements_per_rank(std::vector& elems_per_rank, int num_elems, int } } -void print_mesh_info(Mesh_t& mesh){ - std::cout<<"Mesh has "<(num_new_nodes); - final_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); + // -------------- Phase 6: Build the intermediate_mesh -------------- + intermediate_mesh.initialize_nodes(num_new_nodes); + intermediate_mesh.initialize_elems(num_new_elems, mesh.num_dims); + intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes); + intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); // Fill global mappings for (int i = 0; i < num_new_nodes; ++i) - final_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; + intermediate_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; for (int i = 0; i < num_new_elems; ++i) - final_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; + intermediate_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; - final_mesh.local_to_global_node_mapping.update_device(); - final_mesh.local_to_global_elem_mapping.update_device(); + intermediate_mesh.local_to_global_node_mapping.update_device(); + intermediate_mesh.local_to_global_elem_mapping.update_device(); MPI_Barrier(MPI_COMM_WORLD); @@ -1325,7 +1309,7 
@@ int main(int argc, char** argv) { int left = 0, right = num_new_nodes - 1; while (left <= right) { int mid = left + (right - left) / 2; - size_t mid_gid = final_mesh.local_to_global_node_mapping.host(mid); + size_t mid_gid = intermediate_mesh.local_to_global_node_mapping.host(mid); if (node_gid == mid_gid) { node_lid = mid; break; @@ -1336,14 +1320,14 @@ int main(int argc, char** argv) { } } - final_mesh.nodes_in_elem.host(i, j) = node_lid; + intermediate_mesh.nodes_in_elem.host(i, j) = node_lid; } } MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout<<" Finished reverse mapping of the element-node connectivity from the global node ids to the local node ids"<second[0]; - final_node.coords.host(i, 1) = it->second[1]; - final_node.coords.host(i, 2) = it->second[2]; + intermediate_node.coords.host(i, 0) = it->second[0]; + intermediate_node.coords.host(i, 1) = it->second[1]; + intermediate_node.coords.host(i, 2) = it->second[2]; } } - final_node.coords.update_device(); + intermediate_node.coords.update_device(); // Connectivity rebuild - final_mesh.build_connectivity(); + intermediate_mesh.build_connectivity(); MPI_Barrier(MPI_COMM_WORLD); @@ -1401,7 +1385,7 @@ int main(int argc, char** argv) { // MPI_Datatype recvtype, // Type of receive data // MPI_Comm comm // Communicator // ); - MPI_Allgather(&final_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Allgather(&intermediate_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); // Compute displacements std::vector elem_displs(world_size); @@ -1424,7 +1408,7 @@ int main(int argc, char** argv) { // MPI_Datatype recvtype, // Type of receive data // MPI_Comm comm // Communicator // ); - MPI_Allgatherv(final_mesh.local_to_global_elem_mapping.host_pointer(), final_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, + MPI_Allgatherv(intermediate_mesh.local_to_global_elem_mapping.host_pointer(), intermediate_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, 
all_elem_gids.data(), elem_counts.data(), elem_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); @@ -1447,8 +1431,8 @@ int main(int argc, char** argv) { std::set local_elem_nodes; for (int lid = 0; lid < num_new_elems; ++lid) { for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); + size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); local_elem_nodes.insert(node_gid); } } @@ -1459,10 +1443,10 @@ int main(int argc, char** argv) { int local_conn_size = 0; for (int lid = 0; lid < num_new_elems; ++lid) { - size_t elem_gid = final_mesh.local_to_global_elem_mapping.host(lid); + size_t elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(lid); for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); + size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); elem_node_conn.push_back(elem_gid); elem_node_conn.push_back(node_gid); } @@ -1491,7 +1475,7 @@ int main(int argc, char** argv) { // create a set for local_elem_gids std::set local_elem_gids; for (int i = 0; i < num_new_elems; ++i) { - local_elem_gids.insert(final_mesh.local_to_global_elem_mapping.host(i)); + local_elem_gids.insert(intermediate_mesh.local_to_global_elem_mapping.host(i)); } // Build a map: node GID -> set of element GIDs that contain it (from other ranks) @@ -1528,13 +1512,13 @@ int main(int argc, char** argv) { // but are owned by other ranks (these might already be in ghost_elem_gids, but check connectivity) for (int lid = 0; lid < num_new_elems; ++lid) { - size_t num_neighbors = final_mesh.num_elems_in_elem(lid); + size_t 
num_neighbors = intermediate_mesh.num_elems_in_elem(lid); for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { - size_t neighbor_lid = final_mesh.elems_in_elem(lid, nbr_idx); + size_t neighbor_lid = intermediate_mesh.elems_in_elem(lid, nbr_idx); if (neighbor_lid < static_cast(num_new_elems)) { - size_t neighbor_gid = final_mesh.local_to_global_elem_mapping(neighbor_lid); + size_t neighbor_gid = intermediate_mesh.local_to_global_elem_mapping(neighbor_lid); // Check if neighbor is owned by this rank auto it = elem_gid_to_rank.find(neighbor_gid); @@ -1547,7 +1531,7 @@ int main(int argc, char** argv) { } // Count unique ghost elements - final_mesh.num_ghost_elems = ghost_elem_gids.size(); + intermediate_mesh.num_ghost_elems = ghost_elem_gids.size(); MPI_Barrier(MPI_COMM_WORLD); double t_ghost_end = MPI_Wtime(); @@ -1563,10 +1547,10 @@ int main(int argc, char** argv) { MPI_Barrier(MPI_COMM_WORLD); if(rank == i && print_info) { std::cout << "[rank " << rank << "] owns " << num_new_elems - << " elements and has " << final_mesh.num_ghost_elems << " ghost elements" << std::endl; + << " elements and has " << intermediate_mesh.num_ghost_elems << " ghost elements" << std::endl; std::cout << "[rank " << rank << "] owned element global IDs: "; - for (int j = 0; j < final_mesh.num_elems; j++) { - std::cout << final_mesh.local_to_global_elem_mapping(j) << " "; + for (int j = 0; j < intermediate_mesh.num_elems; j++) { + std::cout << intermediate_mesh.local_to_global_elem_mapping(j) << " "; } // Print global IDs of ghost elements @@ -1629,8 +1613,8 @@ int main(int argc, char** argv) { int extended_node_lid = 0; // Add all owned nodes - for (int i = 0; i < final_mesh.num_nodes; ++i) { - size_t node_gid = final_mesh.local_to_global_node_mapping.host(i); + for (int i = 0; i < intermediate_mesh.num_nodes; ++i) { + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(i); node_gid_to_extended_lid[node_gid] = extended_node_lid++; } @@ -1683,22 +1667,22 @@ int 
main(int argc, char** argv) { // Ghost elements: num_new_elems to num_new_elems + num_ghost_elems - 1 // Create extended element-node connectivity array - int total_extended_elems = final_mesh.num_elems + final_mesh.num_ghost_elems; + int total_extended_elems = intermediate_mesh.num_elems + intermediate_mesh.num_ghost_elems; std::vector> extended_nodes_in_elem(total_extended_elems); // Copy owned element connectivity (convert to extended node LIDs) - for (int lid = 0; lid < final_mesh.num_elems; ++lid) { + for (int lid = 0; lid < intermediate_mesh.num_elems; ++lid) { extended_nodes_in_elem[lid].reserve(nodes_per_elem); for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = final_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = final_mesh.local_to_global_node_mapping.host(node_lid); + size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); int ext_lid = node_gid_to_extended_lid[node_gid]; extended_nodes_in_elem[lid].push_back(ext_lid); } } // Add ghost element connectivity (map ghost node GIDs to extended node LIDs) - int ghost_elem_ext_lid = final_mesh.num_elems; + int ghost_elem_ext_lid = intermediate_mesh.num_elems; std::vector ghost_elem_gids_ordered(ghost_elem_gids.begin(), ghost_elem_gids.end()); std::sort(ghost_elem_gids_ordered.begin(), ghost_elem_gids_ordered.end()); @@ -1719,10 +1703,10 @@ int main(int argc, char** argv) { for (int r = 0; r < world_size; ++r) { if (rank == r) { std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; - std::cout << "[rank " << rank << "] - Owned elements: " << final_mesh.num_elems << std::endl; + std::cout << "[rank " << rank << "] - Owned elements: " << intermediate_mesh.num_elems << std::endl; std::cout << "[rank " << rank << "] - Ghost elements: " << ghost_elem_gids.size() << std::endl; std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << 
std::endl; - std::cout << "[rank " << rank << "] - Owned nodes: " << final_mesh.num_nodes << std::endl; + std::cout << "[rank " << rank << "] - Owned nodes: " << intermediate_mesh.num_nodes << std::endl; std::cout << "[rank " << rank << "] - Ghost-only nodes: " << ghost_only_nodes.size() << std::endl; std::cout << "[rank " << rank << "] - Total extended nodes: " << total_extended_nodes << std::endl; std::cout << std::flush; @@ -1742,40 +1726,40 @@ int main(int argc, char** argv) { // Build extended element GID list: owned first, then ghost std::vector extended_lid_to_elem_gid(total_extended_elems); // Owned elements - for (int i = 0; i < final_mesh.num_elems; ++i) { - extended_lid_to_elem_gid[i] = final_mesh.local_to_global_elem_mapping.host(i); + for (int i = 0; i < intermediate_mesh.num_elems; ++i) { + extended_lid_to_elem_gid[i] = intermediate_mesh.local_to_global_elem_mapping.host(i); } // Ghost elements (in sorted order) for (size_t idx = 0; idx < ghost_elem_gids_ordered.size(); ++idx) { - extended_lid_to_elem_gid[final_mesh.num_elems + idx] = ghost_elem_gids_ordered[idx]; + extended_lid_to_elem_gid[intermediate_mesh.num_elems + idx] = ghost_elem_gids_ordered[idx]; } - mesh_with_ghosts.initialize_nodes(total_extended_nodes); - mesh_with_ghosts.initialize_elems(total_extended_elems, 3); - mesh_with_ghosts.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); - mesh_with_ghosts.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); + final_mesh.initialize_nodes(total_extended_nodes); + final_mesh.initialize_elems(total_extended_elems, 3); + final_mesh.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); + final_mesh.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); for (int i = 0; i < total_extended_nodes; i++) { - mesh_with_ghosts.local_to_global_node_mapping.host(i) = extended_lid_to_node_gid[i]; + final_mesh.local_to_global_node_mapping.host(i) = extended_lid_to_node_gid[i]; } for (int i = 0; i < 
total_extended_elems; i++) { - mesh_with_ghosts.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; + final_mesh.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; } - mesh_with_ghosts.local_to_global_node_mapping.update_device(); - mesh_with_ghosts.local_to_global_elem_mapping.update_device(); + final_mesh.local_to_global_node_mapping.update_device(); + final_mesh.local_to_global_elem_mapping.update_device(); - mesh_with_ghosts.num_ghost_elems = ghost_elem_gids.size(); - mesh_with_ghosts.num_ghost_nodes = ghost_only_nodes.size(); + final_mesh.num_ghost_elems = ghost_elem_gids.size(); + final_mesh.num_ghost_nodes = ghost_only_nodes.size(); // Set owned counts for write_vtk (excludes ghost elements/nodes) - mesh_with_ghosts.num_owned_elems = final_mesh.num_elems; - mesh_with_ghosts.num_owned_nodes = final_mesh.num_nodes; + final_mesh.num_owned_elems = intermediate_mesh.num_elems; + final_mesh.num_owned_nodes = intermediate_mesh.num_nodes; // Print num ghost elements and nodes on each rank sequentially for (int r = 0; r < world_size; ++r) { if (rank == r) { - std::cout << "*******[rank " << rank << "] - Ghost elements: " << mesh_with_ghosts.num_ghost_elems << std::endl; - std::cout << "*******[rank " << rank << "] - Ghost-only nodes: " << mesh_with_ghosts.num_ghost_nodes << std::endl; + std::cout << "*******[rank " << rank << "] - Ghost elements: " << final_mesh.num_ghost_elems << std::endl; + std::cout << "*******[rank " << rank << "] - Ghost-only nodes: " << final_mesh.num_ghost_nodes << std::endl; } MPI_Barrier(MPI_COMM_WORLD); } @@ -1790,16 +1774,16 @@ int main(int argc, char** argv) { // extended_nodes_in_elem already contains extended local node IDs, so we can use them directly for(int i = 0; i < total_extended_elems; i++) { for(int j = 0; j < nodes_per_elem; j++) { - mesh_with_ghosts.nodes_in_elem.host(i, j) = extended_nodes_in_elem[i][j]; + final_mesh.nodes_in_elem.host(i, j) = extended_nodes_in_elem[i][j]; } } 
MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout<<" Finished reverse mapping of the element-node connectivity from the global node ids to the local node ids"< coordinate. - // 3. Use this map to fill node_with_ghosts.coords. + // 3. Use this map to fill final_node.coords. // 1. Build list of all global node IDs needed on this rank (owned + ghosts) std::vector all_needed_node_gids(total_extended_nodes); for (int i = 0; i < total_extended_nodes; ++i) { - all_needed_node_gids[i] = mesh_with_ghosts.local_to_global_node_mapping.host(i); + all_needed_node_gids[i] = final_mesh.local_to_global_node_mapping.host(i); } // 2. Build owned node GIDs and their coordinates - std::vector owned_gids(final_mesh.num_nodes); + std::vector owned_gids(intermediate_mesh.num_nodes); for (int i = 0; i < owned_gids.size(); ++i) - owned_gids[i] = final_mesh.local_to_global_node_mapping.host(i); + owned_gids[i] = intermediate_mesh.local_to_global_node_mapping.host(i); // 3. Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) // so we can distribute the needed coordinate data. @@ -1861,9 +1845,9 @@ int main(int argc, char** argv) { // d) Global coords (size: total_owned x 3) std::vector owned_coords_send(3*local_owned_count, 0.0); for (int i=0; i all_owned_coords(3 * total_owned, 0.0); @@ -1890,22 +1874,22 @@ int main(int argc, char** argv) { gid_to_coord[all_owned_gids[i]] = xyz; } - // 4. Finally, fill node_with_ghosts.coords with correct coordinates. + // 4. Finally, fill final_node.coords with correct coordinates. 
for (int i = 0; i < total_extended_nodes; ++i) { - size_t gid = mesh_with_ghosts.local_to_global_node_mapping.host(i); + size_t gid = final_mesh.local_to_global_node_mapping.host(i); auto it = gid_to_coord.find(gid); if (it != gid_to_coord.end()) { - node_with_ghosts.coords.host(i,0) = it->second[0]; - node_with_ghosts.coords.host(i,1) = it->second[1]; - node_with_ghosts.coords.host(i,2) = it->second[2]; + final_node.coords.host(i,0) = it->second[0]; + final_node.coords.host(i,1) = it->second[1]; + final_node.coords.host(i,2) = it->second[2]; } else { // Could happen if there's a bug: fill with zeros for safety - node_with_ghosts.coords.host(i,0) = 0.0; - node_with_ghosts.coords.host(i,1) = 0.0; - node_with_ghosts.coords.host(i,2) = 0.0; + final_node.coords.host(i,0) = 0.0; + final_node.coords.host(i,1) = 0.0; + final_node.coords.host(i,2) = 0.0; } } - node_with_ghosts.coords.update_device(); + final_node.coords.update_device(); @@ -1917,7 +1901,7 @@ int main(int argc, char** argv) { // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
// -------------------------------------------------------------------------------------- - std::vector>> boundary_elem_targets(final_mesh.num_elems); + std::vector>> boundary_elem_targets(intermediate_mesh.num_elems); // Prepare local ghost list as vector std::vector ghost_gids_vec; @@ -1962,8 +1946,8 @@ int main(int argc, char** argv) { } // For each local element, list destinations: ranks that ghost our gid - for (int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { - size_t local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); + for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { + size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); auto it = gid_to_ghosting_ranks.find(local_elem_gid); if (it == gid_to_ghosting_ranks.end()) continue; const std::vector &dest_ranks = it->second; @@ -1979,9 +1963,9 @@ int main(int argc, char** argv) { for(int i = 0; i < world_size; i++) { if (rank == i && print_info) { std::cout << std::endl; - for (int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { + for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { - size_t local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); + size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); if (boundary_elem_targets[elem_lid].empty()) { std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: no ghost elements" << std::endl; @@ -2010,41 +1994,20 @@ int main(int argc, char** argv) { -// NOTES: -// We need to create communication maps for nodes, specifically an index list of -// -- Owned (nodes unique to this rank) -// -- Shared (nodes on the boundary of this rank) -// -- Ghost (nodes on the boundary of this rank that are owned by other ranks) - - -// What we currently have is a communication plan for elements, eg. 
Each shared element (element on an MPI boundary) knows which rank and associated element global id on that rank it is connected to. - - - - - - - - - - - - - for(int i = 0; i < world_size; i++) { if(rank == i && print_info) { - print_rank_mesh_info(final_mesh, i); + print_rank_mesh_info(intermediate_mesh, i); } MPI_Barrier(MPI_COMM_WORLD); } MPI_Barrier(MPI_COMM_WORLD); - // write_vtk(final_mesh, final_node, rank); - write_vtu(mesh_with_ghosts, node_with_ghosts, rank, MPI_COMM_WORLD); + // write_vtk(intermediate_mesh, intermediate_node, rank); + write_vtu(final_mesh, final_node, rank, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index c1be0881..7e6f6c83 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -631,9 +631,7 @@ void write_vtu(Mesh_t& mesh, for (int j = 0; j <= Pn_order; j++) { for (int i = 0; i <= Pn_order; i++) { size_t node_lid = PointIndexFromIJK(i, j, k, order); - size_t node_idx = mesh.nodes_in_elem.host(elem_gid, node_lid); - // Cast to int for Int32 format (valid for node indices < 2^31) - fprintf(vtu_file, " %d", static_cast(node_idx)); + fprintf(vtu_file, " %zu", static_cast(mesh.nodes_in_elem.host(elem_gid, node_lid))); } } } From bdd0c1928301c8c43f9b292b1a85e790d24c13ca Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 31 Oct 2025 15:53:20 -0500 Subject: [PATCH 14/52] ENH: Tidying up main --- examples/mesh_decomp/decomp_utils.h | 1956 +++++++++++++++++++++++++ examples/mesh_decomp/mesh_decomp.cpp | 1987 +------------------------- examples/mesh_decomp/mesh_io.h | 20 - 3 files changed, 1976 insertions(+), 1987 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index e69de29b..0357b6a6 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -0,0 +1,1956 @@ +#ifndef DECOMP_UTILS_H +#define DECOMP_UTILS_H + +#include +#include +#include +#include 
+#include +#include +#include +#include + + +#include "mesh.h" +#include "state.h" +#include "mesh_io.h" + +// Include Scotch headers +#include "scotch.h" +#include "ptscotch.h" + +void partition_mesh( + Mesh_t& initial_mesh, + Mesh_t& final_mesh, + node_t& initial_node, + node_t& final_node, + int world_size, + int rank){ + + bool print_info = false; + bool print_vtk = false; + + // Create mesh, gauss points, and node data structures on each rank + // This is the initial partitioned mesh + Mesh_t naive_mesh; + node_t naive_node; + + // Mesh partitioned by pt-scotch, not including ghost + Mesh_t intermediate_mesh; + node_t intermediate_node; + + int num_elements_on_rank = 0; + int num_nodes_on_rank = 0; + + int num_nodes_per_elem = 0; + + std::vector elements_on_rank; + std::vector nodes_on_rank; + + + std::vector elems_per_rank(world_size); // number of elements to send to each rank size(world_size) + std::vector nodes_per_rank(world_size); // number of nodes to send to each rank size(world_size) + + // create a 2D vector of elements to send to each rank + std::vector> elements_to_send(world_size); + + // create a 2D vector of nodes to send to each rank + std::vector> nodes_to_send(world_size); + + // Create a 2D vector to hold the nodal positions on each rank + std::vector> node_pos_to_send(world_size); + + // create a 2D vector to hold the node positions on each rank + std::vector> node_pos_on_rank(world_size); + + + + + if (rank == 0) { + + num_nodes_per_elem = initial_mesh.num_nodes_in_elem; + + // Compute elements to send to each rank; handle remainders for non-even distribution + + // Compute elements to send to each rank; handle remainders for non-even distribution + std::fill(elems_per_rank.begin(), elems_per_rank.end(), initial_mesh.num_elems / world_size); + int remainder = initial_mesh.num_elems % world_size; + for (int i = 0; i < remainder; ++i) { + elems_per_rank[i] += 1; + } + } + + MPI_Bcast(&num_nodes_per_elem, 1, MPI_INT, 0, MPI_COMM_WORLD); + 
MPI_Barrier(MPI_COMM_WORLD); + +// ******************************************************** +// Scatter the number of elements to each rank +// ******************************************************** + // All ranks participate in the scatter operation + // MPI_Scatter signature: + // MPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, + // void *recvbuf, int recvcount, MPI_Datatype recvtype, + // int root, MPI_Comm comm) + double t_scatter_start = MPI_Wtime(); + MPI_Scatter(elems_per_rank.data(), 1, MPI_INT, + &num_elements_on_rank, 1, MPI_INT, + 0, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // Resize the elements_on_rank vector to hold the received data + elements_on_rank.resize(num_elements_on_rank); + + + MPI_Barrier(MPI_COMM_WORLD); + double t_scatter_end = MPI_Wtime(); + if(rank == 0) { + std::cout<<" Finished scattering the number of elements to each rank"< all_elements; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = elems_per_rank[i]; + displs[i] = displacement; + // Copy elements for rank i to the flattened array + for (int j = 0; j < elems_per_rank[i]; j++) { + all_elements.push_back(elements_to_send[i][j]); + } + displacement += elems_per_rank[i]; + } + + // Send the elements to each rank + MPI_Scatterv(all_elements.data(), sendcounts.data(), displs.data(), MPI_INT, + elements_on_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + elements_on_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + double t_scatter_gids_end = MPI_Wtime(); + if(rank == 0) { + std::cout<<" Finished scattering the actual element global ids to each rank"< nodes_set; + for (int j = 0; j < elems_per_rank[i]; j++) { + for (int k = 0; k < num_nodes_per_elem; k++) { + 
nodes_set.insert(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); + } + } + nodes_to_send[i] = std::vector(nodes_set.begin(), nodes_set.end()); + } + + for (int i = 0; i < world_size; i++) { + nodes_per_rank[i] = nodes_to_send[i].size(); + } + + if (print_info) { + + + std::cout< all_nodes; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = nodes_to_send[i].size(); + displs[i] = displacement; + // Copy nodes for rank i to the flattened array + for (int j = 0; j < nodes_to_send[i].size(); j++) { + all_nodes.push_back(nodes_to_send[i][j]); + } + displacement += nodes_to_send[i].size(); + } + // Send the nodes to each rank + // all_nodes.data(): Pointer to the flattened array of all nodes to be sent to each rank + // sendcounts.data(): Array with the number of nodes to send to each rank + // displs.data(): Array with the displacement for each rank in the flattened array + // MPI_INT: Data type of the nodes (integer) + // nodes_on_rank.data(): Pointer to the buffer where each rank will receive its nodes + // num_nodes_on_rank: Number of nodes that the receiving rank expects to receive + // MPI_INT: Data type of the receive buffer (integer) + // 0: The root rank (rank 0) that is performing the scatter + // MPI_COMM_WORLD: The communicator + MPI_Scatterv(all_nodes.data(), sendcounts.data(), displs.data(), MPI_INT, + nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + // Timer: End measuring time for scattering node global ids + double t_scatter_nodeids_end = MPI_Wtime(); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) { + std::cout<<" Finished scattering the actual node global ids to each rank"< node_pos_on_rank_flat(num_nodes_on_rank * 3); + + // Timer for 
scattering node positions + double t_scatter_nodepos_start = MPI_Wtime(); + + if(rank == 0) + { + for (int i = 0; i < world_size; i++) { + for(int node_gid = 0; node_gid < nodes_to_send[i].size(); node_gid++) + { + node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 0)); + node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 1)); + node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 2)); + } + } + + // Prepare data for MPI_Scatterv (scatter with variable counts) + // Flatten the 2D node_pos_to_send into a 1D array + std::vector all_node_pos; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for (int i = 0; i < world_size; i++) { + sendcounts[i] = nodes_to_send[i].size() * 3; + displs[i] = displacement; // displacement is the starting index of the nodes for the current rank in the flattened array + // Copy node positions for rank i to the flattened array + for(int j = 0; j < nodes_to_send[i].size(); j++) { + for(int k = 0; k < 3; k++) { + all_node_pos.push_back(node_pos_to_send[i][j * 3 + k]); + } + } + displacement += nodes_to_send[i].size() * 3; + } + + // Send the node positions to each rank + MPI_Scatterv(all_node_pos.data(), sendcounts.data(), displs.data(), MPI_DOUBLE, + node_pos_on_rank_flat.data(), num_nodes_on_rank * 3, MPI_DOUBLE, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_DOUBLE, + node_pos_on_rank_flat.data(), num_nodes_on_rank * 3, MPI_DOUBLE, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + if (rank == 0 && print_info) { + // Print out the node positions on this rank + std::cout << "Rank " << rank << " received node positions: "; + for (int i = 0; i < num_nodes_on_rank; i++) { + std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " + << node_pos_on_rank_flat[i*3+1] << ", " + << node_pos_on_rank_flat[i*3+2] << ") "; + } + std::cout << std::endl; + } + + + 
MPI_Barrier(MPI_COMM_WORLD); + + if (rank == 1 && print_info) { + // Print out the node positions on this rank + std::cout << "Rank " << rank << " received node positions: "; + for (int i = 0; i < num_nodes_on_rank; i++) { + std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " + << node_pos_on_rank_flat[i*3+1] << ", " + << node_pos_on_rank_flat[i*3+2] << ") "; + } + std::cout << std::endl; + } + + MPI_Barrier(MPI_COMM_WORLD); + + double t_scatter_nodepos_end = MPI_Wtime(); + if(rank == 0) { + std::cout<<" Finished scattering the node positions to each rank"< required_node_state = { node_state::coords }; + naive_node.initialize(num_nodes_on_rank, 3, required_node_state); + + for(int i = 0; i < num_nodes_on_rank; i++) { + naive_node.coords.host(i, 0) = node_pos_on_rank_flat[i*3]; + naive_node.coords.host(i, 1) = node_pos_on_rank_flat[i*3+1]; + naive_node.coords.host(i, 2) = node_pos_on_rank_flat[i*3+2]; + } + + naive_node.coords.update_device(); + + +// ****************************************************************************************** +// Send the element-node connectivity data from the initial mesh to each rank +// ****************************************************************************************** + + // Send the element-node connectivity data from the initial mesh to each rank + std::vector nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); + + double t_scatter_elemnode_start = MPI_Wtime(); + + if (rank == 0) { + // Prepare element-node connectivity data for each rank + std::vector all_nodes_in_elem; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + for(int i = 0; i < world_size; i++) { + int num_connectivity_entries = elements_to_send[i].size() * num_nodes_per_elem; // num_nodes_per_elem nodes per element + sendcounts[i] = num_connectivity_entries; + displs[i] = displacement; + + // Copy element-node connectivity for rank i + for(int j = 0; j < elements_to_send[i].size(); j++) { + 
for(int k = 0; k < num_nodes_per_elem; k++) { + all_nodes_in_elem.push_back(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); + } + } + displacement += num_connectivity_entries; + } + // Send the connectivity data to each rank + MPI_Scatterv(all_nodes_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, + nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + double t_scatter_elemnode_end = MPI_Wtime(); + if(rank == 0) { + std::cout << " Finished scattering the element-node connectivity data from the initial mesh to each rank" << std::endl; + std::cout << " Scattering element-node connectivity took " + << (t_scatter_elemnode_end - t_scatter_elemnode_start) << " seconds." << std::endl; + } + + if (rank == 0 && print_info) { + + std::cout << "Rank " << rank << " received element-node connectivity (" + << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; + for (int elem = 0; elem < num_elements_on_rank; elem++) { + std::cout << " Element " << elem << " nodes: "; + for (int node = 0; node < num_nodes_per_elem; node++) { + int idx = elem * num_nodes_per_elem + node; + std::cout << nodes_in_elem_on_rank[idx] << " "; + } + std::cout << std::endl; + } + } + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished scattering the element-node connectivity data from the initial mesh to each rank"< elem_elem_counts(world_size); + int total_elem_elem_entries = 0; + + + double t_scatter_elem_elem_start = MPI_Wtime(); + + if (rank == 0){ + // Calculate total number of connectivity entries for each rank + for(int i = 0; i < world_size; i++) { + elem_elem_counts[i] = 0; + for(int k = 0; k < elements_to_send[i].size(); k++) { + elem_elem_counts[i] += 
initial_mesh.num_elems_in_elem(elements_to_send[i][k]); + } + + if(print_info) std::cout << "Rank " << i << " will receive " << elem_elem_counts[i] << " element-element connectivity entries" << std::endl; + } + + // Print element-element connectivity entries for each rank in the initial mesh + if(print_info) { + for(int i = 0; i < world_size; i++) { + std::cout << std::endl; + std::cout << "Rank " << i << " will receive element-element connectivity entries for the following elements: "< elems_in_elem_on_rank(total_elem_elem_entries); + + // Now scatter the num_elems_in_elem for each element on each rank + std::vector num_elems_in_elem_per_rank(num_elements_on_rank); + + if (rank == 0) { + std::vector all_num_elems_in_elem; + std::vector displs_ee(world_size); + int displacement = 0; + + for(int i = 0; i < world_size; i++) { + displs_ee[i] = displacement; + for(int k = 0; k < elements_to_send[i].size(); k++) { + all_num_elems_in_elem.push_back(initial_mesh.num_elems_in_elem(elements_to_send[i][k])); + } + displacement += elements_to_send[i].size(); + } + + MPI_Scatterv(all_num_elems_in_elem.data(), elems_per_rank.data(), displs_ee.data(), MPI_INT, + num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished scattering the actual element-element connectivity counts per element to each rank"< all_elems_in_elem; + std::vector sendcounts(world_size); + std::vector displs(world_size); + + int displacement = 0; + + for(int i = 0; i < world_size; i++) { + sendcounts[i] = elem_elem_counts[i]; + displs[i] = displacement; + + // Copy element-element connectivity for rank i + for(int k = 0; k < elements_to_send[i].size(); k++) { + for(int l = 0; l < initial_mesh.num_elems_in_elem(elements_to_send[i][k]); l++) { + 
all_elems_in_elem.push_back(initial_mesh.elems_in_elem(elements_to_send[i][k], l)); + } + } + displacement += elem_elem_counts[i]; + } + + // Send the element-element connectivity data to each rank using MPI_Scatterv + MPI_Scatterv(all_elems_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, + elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, + 0, MPI_COMM_WORLD); + } + else { + MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, + 0, MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished receiving the actual element-element connectivity entries to each rank"<(num_nodes_on_rank, "naive_mesh.local_to_global_node_mapping"); + naive_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_elements_on_rank, "naive_mesh.local_to_global_elem_mapping"); + + for(int i = 0; i < num_nodes_on_rank; i++) { + naive_mesh.local_to_global_node_mapping.host(i) = nodes_on_rank[i]; + } + + for(int i = 0; i < num_elements_on_rank; i++) { + naive_mesh.local_to_global_elem_mapping.host(i) = elements_on_rank[i]; + } + + naive_mesh.local_to_global_node_mapping.update_device(); + naive_mesh.local_to_global_elem_mapping.update_device(); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< vertloctab: + * CSR array [size vertlocnbr+1]: for each local vertex i, vertloctab[i] + * gives the index in edgeloctab where the neighbor list of vertex i begins. + * PT-Scotch expects this array to be of size vertlocnbr+1, where the difference + * vertloctab[i+1] - vertloctab[i] gives the number of edges for vertex i. + * + * - std::vector edgeloctab: + * CSR array [variable size]: a flattened list of *neighboring element global IDs*, + * in no particular order. 
For vertex i, its neighbors are located at + * edgeloctab[vertloctab[i]...vertloctab[i+1]-1]. + * In this compact CSR, these are global IDs (GIDs), enabling PT-Scotch to + * recognize edges both within and across ranks. + * + * - std::map elem_gid_to_offset: + * Helper map: For a given element global ID, gives the starting offset in + * the flattened neighbor array (elems_in_elem_on_rank) where this element's + * list of neighbor GIDs begins. This allows efficient neighbor list lookup. + * + * - (other arrays used, from mesh setup and communication phase) + * - elements_on_rank: vector of global element IDs owned by this rank. + * - num_elements_on_rank: number of owned elements. + * - num_elems_in_elem_per_rank: array, for each owned element, how many + * neighbors it has. + * - elems_in_elem_on_rank: flattened array of global neighbor IDs for all local elements. + * + **********************************************************************************/ + + // --- Step 1: Initialize the PT-Scotch distributed graph object on this MPI rank --- + SCOTCH_Dgraph dgraph; + if (SCOTCH_dgraphInit(&dgraph, MPI_COMM_WORLD) != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphInit failed\n"; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + // Set base value for numbering (0 for C-style arrays) + const SCOTCH_Num baseval = 0; + + // vertlocnbr: Number of elements (vertices) that are local to this MPI rank + const SCOTCH_Num vertlocnbr = static_cast(naive_mesh.num_elems); + + // vertlocmax: Maximum possible local vertices (no holes, so identical to vertlocnbr) + const SCOTCH_Num vertlocmax = vertlocnbr; + + // --- Step 2: Build compact CSR arrays for PT-Scotch (vertloctab, edgeloctab) --- + // vertloctab: for each local mesh element [vertex], gives index in edgeloctab where its neighbor list begins + std::vector vertloctab(vertlocnbr + 1); + + // edgeloctab: flat array of neighbor global IDs for all local elements, built in order + std::vector edgeloctab; + 
edgeloctab.reserve(vertlocnbr * 6); // heuristic: assume typical mesh degree is ~6, for performance + + // Construct a map from element GID to its offset into elems_in_elem_on_rank (the array of neighbor GIDs) + // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. + std::map elem_gid_to_offset; + size_t current_offset = 0; + for (size_t k = 0; k < num_elements_on_rank; k++) { + elem_gid_to_offset[elements_on_rank[k]] = current_offset; + current_offset += num_elems_in_elem_per_rank[k]; + } + + // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- + SCOTCH_Num offset = 0; // running count of edges encountered + + for (size_t lid = 0; lid < naive_mesh.num_elems; ++lid) { + + // Record current edge offset for vertex lid in vertloctab + vertloctab[lid] = offset; + + // Obtain this local element's global ID (from mapping) + int elem_gid = naive_mesh.local_to_global_elem_mapping.host(lid); + + // Find offset in the flattened neighbor array for this element's neighbor list + size_t elems_in_elem_offset = elem_gid_to_offset[elem_gid]; + + // For this element, find the count of its neighbors + // This requires finding its index in the elements_on_rank array + size_t idx = 0; + for (size_t k = 0; k < num_elements_on_rank; k++) { + if (elements_on_rank[k] == elem_gid) { + idx = k; + break; + } + } + size_t num_nbrs = num_elems_in_elem_per_rank[idx]; + + // Append each neighbor (by its GLOBAL elem GID) to edgeloctab + for (size_t j = 0; j < num_nbrs; j++) { + size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; // This is a global element ID! 
+ edgeloctab.push_back(static_cast(neighbor_gid)); + ++offset; // Increment running edge count + } + } + + // vertloctab[vertlocnbr] stores total number of edges written, finalizes the CSR structure + vertloctab[vertlocnbr] = offset; + + // edgelocnbr/edgelocsiz: Number of edge endpoints defined locally + // (PT-Scotch's distributed graphs allow edges to be replicated or owned by either endpoint) + const SCOTCH_Num edgelocnbr = offset; // total number of edge endpoints (sum of all local neighbor degrees) + const SCOTCH_Num edgelocsiz = edgelocnbr; // allocated size matches number of endpoints + + // Optionally print graph structure for debugging/validation + if (print_info) { + std::cout << "Rank " << rank << ": vertlocnbr = # of local elements(vertices) = " << vertlocnbr + << ", edgelocnbr = # of local edge endpoints = " << edgelocnbr << std::endl; + std::cout << "vertloctab (CSR row offsets): "; + for (size_t i = 0; i <= vertlocnbr; i++) { + std::cout << vertloctab[i] << " "; + } + std::cout << std::endl; + std::cout << "edgeloctab (first 20 neighbor GIDs): "; + for (size_t i = 0; i < std::min((size_t)20, edgeloctab.size()); i++) { + std::cout << edgeloctab[i] << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + /************************************************************************** + * Step 4: Build the distributed graph using PT-Scotch's SCOTCH_dgraphBuild + * + * - PT-Scotch will use our CSR arrays. Since we use compact representation, + * most optional arrays ("veloloctab", "vlblloctab", "edgegsttab", "edloloctab") + * can be passed as nullptr. + * - edgeloctab contains *GLOBAL element GIDs* of neighbors. PT-Scotch uses this + * to discover connections across processor boundaries, so you do not have to + * encode ownership or partition information yourself. 
+ **************************************************************************/ + int rc = SCOTCH_dgraphBuild( + &dgraph, + baseval, // start index (0) + vertlocnbr, // local vertex count (local elements) + vertlocmax, // local vertex max (no holes) + vertloctab.data(), // row offsets in edgeloctab + /*vendloctab*/ nullptr, // end of row offsets (compact CSR => nullptr) + /*veloloctab*/ nullptr, // vertex weights, not used + /*vlblloctab*/ nullptr, // vertex global labels (we use GIDs in edgeloctab) + edgelocnbr, // local edge endpoints count + edgelocsiz, // size of edge array + edgeloctab.data(), // global neighbor IDs for each local node + /*edgegsttab*/ nullptr, // ghost edge array, not used + /*edloloctab*/ nullptr // edge weights, not used + ); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphBuild failed rc=" << rc << "\n"; + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); + } + + // Optionally, print rank summary after graph build for further validation + if (print_info) { + SCOTCH_Num vertlocnbr_out; + SCOTCH_dgraphSize(&dgraph, &vertlocnbr_out, nullptr, nullptr, nullptr); + std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr = " << vertlocnbr_out << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished building the distributed graph using PT-Scotch"<(world_size)); + + + + + // ===================== PT-Scotch Strategy Selection and Documentation ====================== + // The PT-Scotch "strategy" (stratdat here) controls the algorithms and heuristics used for partitioning. + // You can specify a string or build a strategy using functions that adjust speed, quality, and recursion. + // + // Common strategy flags (see "scotch.h", "ptscotch.h", and PT-Scotch documentation): + // + // - SCOTCH_STRATDEFAULT: Use the default (fast, reasonable quality) partitioning strategy. + // Useful for quick, generic partitions where quality is not critical. 
+ // + // - SCOTCH_STRATSPEED: Aggressively maximizes speed (at the cost of cut quality). + // For large runs or test runs where speed is more important than minimizing edgecut. + // + // - SCOTCH_STRATQUALITY: Prioritizes partition *quality* (minimizing edge cuts, maximizing load balance). + // Slower than the default. Use when high-quality partitioning is desired. + // + // - SCOTCH_STRATBALANCE: Tradeoff between speed and quality for balanced workload across partitions. + // Use if load balance is more critical than cut size. + // + // Additional Options: + // - Strategy can also be specified as a string (see Scotch manual, e.g., "b{sep=m{...} ...}"). + // - Recursion count parameter (here, set to 0) controls strategy recursion depth (0 = automatic). + // - Imbalance ratio (here, 0.01) allows minor imbalance in part weight for better cut quality. + // + // Example usage: + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATQUALITY, nparts, 0, 0.01); + // ^ quality-focused, nparts=number of parts/ranks + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATSPEED, nparts, 0, 0.05); + // ^ speed-focused, allow 5% imbalance + // + // Reference: + // - https://gitlab.inria.fr/scotch/scotch/-/blob/master/doc/libptscotch.pdf + // - SCOTCH_stratDgraphMapBuild() and related "strategy" documentation. + // + // --------------- Set up the desired partitioning strategy here: --------------- + SCOTCH_Strat stratdat; // PT-Scotch strategy object: holds partitioning options/settings + SCOTCH_stratInit(&stratdat); + + // Select partitioning strategy for this run: + // Use SCOTCH_STRATQUALITY for best cut quality. + // To change: replace with SCOTCH_STRATDEFAULT, SCOTCH_STRATSPEED, or SCOTCH_STRATBALANCE as discussed above. 
+ // Arguments: (strategy object, strategy flag, #parts, recursion (0=auto), imbalance ratio) + SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.01); + + // partloctab: output array mapping each local element (vertex) to a *target partition number* + // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. + std::vector partloctab(vertlocnbr); + rc = SCOTCH_dgraphMap(&dgraph, &archdat, &stratdat, partloctab.data()); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphMap failed rc=" << rc << "\n"; + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); + } + + // Clean up PT-Scotch strategy and architecture objects + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + + // Free the graph now that we have the partition assignments + SCOTCH_dgraphFree(&dgraph); + + /*************************************************************************** + * Step 7 (Optional): Print out the partitioning assignment per element + * - Each local element's local index lid and global ID (gid) are listed with the + * part to which PT-Scotch has assigned them. 
+ ***************************************************************************/ + print_info = false; + for(int rank_id = 0; rank_id < world_size; rank_id++) { + if(rank_id == rank && print_info) { + for (size_t lid = 0; lid < naive_mesh.num_elems; ++lid) { + size_t gid = naive_mesh.local_to_global_elem_mapping.host(lid); + std::cout << "[rank " << rank_id << "] elem_local=" << lid << " gid=" << gid + << " -> part=" << partloctab[lid] << "\n"; + } + MPI_Barrier(MPI_COMM_WORLD); + } + MPI_Barrier(MPI_COMM_WORLD); + } + print_info = false; + + + + +// ****************************************************************************************** +// Build the final mesh from the repartition +// ****************************************************************************************** + + + + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout << "\n=== Starting Mesh Redistribution Phase ===\n"; + MPI_Barrier(MPI_COMM_WORLD); + + // -------------- Phase 1: Determine elements to send to each rank -------------- + std::vector> elems_to_send(world_size); + for (int lid = 0; lid < naive_mesh.num_elems; ++lid) { + int dest = static_cast(partloctab[lid]); + int elem_gid = static_cast(naive_mesh.local_to_global_elem_mapping.host(lid)); + elems_to_send[dest].push_back(elem_gid); + } + + // -------------- Phase 2: Exchange element GIDs -------------- + std::vector sendcounts(world_size), recvcounts(world_size); + for (int r = 0; r < world_size; ++r) + sendcounts[r] = static_cast(elems_to_send[r].size()); + + MPI_Alltoall(sendcounts.data(), 1, MPI_INT, recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // Compute displacements + std::vector sdispls(world_size), rdispls(world_size); + int send_total = 0, recv_total = 0; + for (int r = 0; r < world_size; ++r) { + sdispls[r] = send_total; + rdispls[r] = recv_total; + send_total += sendcounts[r]; + recv_total += recvcounts[r]; + } + + + // Flatten send buffer + std::vector sendbuf; + 
sendbuf.reserve(send_total); + for (int r = 0; r < world_size; ++r) + sendbuf.insert(sendbuf.end(), elems_to_send[r].begin(), elems_to_send[r].end()); + + // Receive new local element GIDs + std::vector recvbuf(recv_total); + MPI_Alltoallv(sendbuf.data(), sendcounts.data(), sdispls.data(), MPI_INT, + recvbuf.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging element GIDs"< new_elem_gids = recvbuf; + int num_new_elems = static_cast(new_elem_gids.size()); + + + if (print_info) { + std::cout << "[rank " << rank << "] new elems: " << num_new_elems << std::endl; + } + + // -------------- Phase 3: Send element–node connectivity -------------- + int nodes_per_elem = naive_mesh.num_nodes_in_elem; + + // Flatten element-node connectivity by global node IDs + std::vector conn_sendbuf; + for (int r = 0; r < world_size; ++r) { + for (int gid : elems_to_send[r]) { + // find local element lid from gid + int lid = -1; + for (int i = 0; i < naive_mesh.num_elems; ++i) + if (naive_mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + + for (int j = 0; j < nodes_per_elem; j++) { + int node_lid = naive_mesh.nodes_in_elem.host(lid, j); + int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); + conn_sendbuf.push_back(node_gid); + } + } + } + + // element-node connectivity counts (ints per dest rank) + std::vector conn_sendcounts(world_size), conn_recvcounts(world_size); + for (int r = 0; r < world_size; ++r) + conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; + + MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity counts"< conn_sdispls(world_size), conn_rdispls(world_size); + int conn_send_total = 0, conn_recv_total = 0; + for (int r = 0; r < world_size; ++r) { + conn_sdispls[r] = 
conn_send_total; + conn_rdispls[r] = conn_recv_total; + conn_send_total += conn_sendcounts[r]; + conn_recv_total += conn_recvcounts[r]; + } + + std::vector conn_recvbuf(conn_recv_total); + MPI_Alltoallv(conn_sendbuf.data(), conn_sendcounts.data(), conn_sdispls.data(), MPI_INT, + conn_recvbuf.data(), conn_recvcounts.data(), conn_rdispls.data(), MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity"< node_gid_set(conn_recvbuf.begin(), conn_recvbuf.end()); + std::vector new_node_gids(node_gid_set.begin(), node_gid_set.end()); + int num_new_nodes = static_cast(new_node_gids.size()); + + // Build map gid→lid + std::unordered_map node_gid_to_lid; + for (int i = 0; i < num_new_nodes; ++i) + node_gid_to_lid[new_node_gids[i]] = i; + + if (print_info) + std::cout << "[rank " << rank << "] owns " << num_new_nodes << " unique nodes\n"; + + + // -------------- Phase 5: Request node coordinates -------------- + std::vector node_coords_sendbuf; + for (int r = 0; r < world_size; ++r) { + for (int gid : elems_to_send[r]) { + int lid = -1; + for (int i = 0; i < naive_mesh.num_elems; ++i) + if (naive_mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + + for (int j = 0; j < nodes_per_elem; j++) { + int node_lid = naive_mesh.nodes_in_elem.host(lid, j); + int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); + + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 0)); + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 1)); + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 2)); + } + } + } + + // Each node is 3 doubles; same sendcounts scaling applies + std::vector coord_sendcounts(world_size), coord_recvcounts(world_size); + for (int r = 0; r < world_size; ++r) + coord_sendcounts[r] = sendcounts[r] * nodes_per_elem * 3; + + MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + 
MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates counts"< coord_sdispls(world_size), coord_rdispls(world_size); + int coord_send_total = 0, coord_recv_total = 0; + for (int r = 0; r < world_size; ++r) { + coord_sdispls[r] = coord_send_total; + coord_rdispls[r] = coord_recv_total; + coord_send_total += coord_sendcounts[r]; + coord_recv_total += coord_recvcounts[r]; + } + + std::vector coord_recvbuf(coord_recv_total); + MPI_Alltoallv(node_coords_sendbuf.data(), coord_sendcounts.data(), coord_sdispls.data(), MPI_DOUBLE, + coord_recvbuf.data(), coord_recvcounts.data(), coord_rdispls.data(), MPI_DOUBLE, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates"<(num_new_nodes); + intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); + + // Fill global mappings + for (int i = 0; i < num_new_nodes; ++i) + intermediate_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; + for (int i = 0; i < num_new_elems; ++i) + intermediate_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; + + intermediate_mesh.local_to_global_node_mapping.update_device(); + intermediate_mesh.local_to_global_elem_mapping.update_device(); + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"<> node_gid_to_coords; + int coord_idx = 0; + for (int e = 0; e < num_new_elems; ++e) { + for (int j = 0; j < nodes_per_elem; j++) { + int node_gid = conn_recvbuf[e * nodes_per_elem + j]; + if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { + node_gid_to_coords[node_gid] = { + coord_recvbuf[coord_idx*3 + 0], + coord_recvbuf[coord_idx*3 + 1], + coord_recvbuf[coord_idx*3 + 2] + }; + } + coord_idx++; + } + } + + // Now fill coordinates in node order + intermediate_node.initialize(num_new_nodes, 3, {node_state::coords}); + for (int i = 0; i < 
num_new_nodes; ++i) { + int node_gid = new_node_gids[i]; + auto it = node_gid_to_coords.find(node_gid); + if (it != node_gid_to_coords.end()) { + intermediate_node.coords.host(i, 0) = it->second[0]; + intermediate_node.coords.host(i, 1) = it->second[1]; + intermediate_node.coords.host(i, 2) = it->second[2]; + } + } + intermediate_node.coords.update_device(); + + // Connectivity rebuild + intermediate_mesh.build_connectivity(); + MPI_Barrier(MPI_COMM_WORLD); + + + +// ****************************************************************************************** +// Build the ghost elements and nodes +// ****************************************************************************************** + + double t_ghost_start = MPI_Wtime(); + + // First, gather the number of elements each rank owns + std::vector elem_counts(world_size); + + // int MPI_Allgather( + // const void* sendbuf, // Data to send from this process + // int sendcount, // Number of elements to send + // MPI_Datatype sendtype, // Type of send data + // void* recvbuf, // Buffer to receive all data + // int recvcount, // Number of elements to receive from each process + // MPI_Datatype recvtype, // Type of receive data + // MPI_Comm comm // Communicator + // ); + MPI_Allgather(&intermediate_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + // Compute displacements + std::vector elem_displs(world_size); + int total_elems = 0; + for (int r = 0; r < world_size; ++r) { + elem_displs[r] = total_elems; + total_elems += elem_counts[r]; + } + + // Gather all element GIDs from all ranks + std::vector all_elem_gids(total_elems); + + // int MPI_Allgatherv( + // const void* sendbuf, // Data to send from this process + // int sendcount, // Number of elements THIS process sends + // MPI_Datatype sendtype, // Type of send data + // void* recvbuf, // Buffer to receive all data + // const int* recvcounts, // Array: number of elements from each process + // const int* 
displs, // Array: displacement for each process's data + // MPI_Datatype recvtype, // Type of receive data + // MPI_Comm comm // Communicator + // ); + MPI_Allgatherv(intermediate_mesh.local_to_global_elem_mapping.host_pointer(), intermediate_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, + all_elem_gids.data(), elem_counts.data(), elem_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + // Build a map: element GID -> owning rank + std::map elem_gid_to_rank; + for (int r = 0; r < world_size; ++r) { + for (int i = 0; i < elem_counts[r]; ++i) { + size_t gid = all_elem_gids[elem_displs[r] + i]; + elem_gid_to_rank[gid] = r; + } + } + + // Strategy: Find ghost elements by checking neighbors of our boundary elements. + // A boundary element is one that has a neighbor owned by another rank. + // However, since build_connectivity() only includes locally-owned elements, + // we need to use a different approach: find elements on other ranks that share + // nodes with our locally-owned elements. 
+ + // First, collect all nodes that belong to our locally-owned elements + std::set local_elem_nodes; + for (int lid = 0; lid < num_new_elems; ++lid) { + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + local_elem_nodes.insert(node_gid); + } + } + + // Now collect element-to-node connectivity to send to all ranks + // Format: for each element, list its node GIDs (each entry is a pair: elem_gid, node_gid) + std::vector elem_node_conn; + int local_conn_size = 0; + + for (int lid = 0; lid < num_new_elems; ++lid) { + size_t elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(lid); + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + elem_node_conn.push_back(elem_gid); + elem_node_conn.push_back(node_gid); + } + local_conn_size += nodes_per_elem * 2; // Each pair is 2 size_ts + } + + // Exchange element-node connectivity with all ranks using Allgather + // First, gather the sizes from each rank + std::vector conn_sizes(world_size); + MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + // Compute displacements + std::vector conn_displs(world_size); + int total_conn = 0; + for (int r = 0; r < world_size; ++r) { + conn_displs[r] = total_conn; + total_conn += conn_sizes[r]; + } + + // Gather all element-node pairs from all ranks + std::vector all_conn(total_conn); + MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG, + all_conn.data(), conn_sizes.data(), conn_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + // create a set for local_elem_gids + std::set local_elem_gids; + for (int i = 0; i < num_new_elems; ++i) { + 
local_elem_gids.insert(intermediate_mesh.local_to_global_elem_mapping.host(i)); + } + + // Build a map: node GID -> set of element GIDs that contain it (from other ranks) + std::map> node_to_ext_elem; + for (int r = 0; r < world_size; ++r) { + if (r == rank) continue; // Skip our own data + // Process pairs from rank r: conn_sizes[r] is in units of size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + for (int i = 0; i < num_pairs; ++i) { + // Each pair is 2 size_ts, starting at conn_displs[r] + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this node is in one of our elements, then the element is a potential ghost + if (local_elem_nodes.find(node_gid) != local_elem_nodes.end()) { + // Check if this element is not owned by us + if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { + node_to_ext_elem[node_gid].insert(elem_gid); + } + } + } + } + + // Collect all unique ghost element GIDs + std::set ghost_elem_gids; + for (const auto& pair : node_to_ext_elem) { + for (size_t elem_gid : pair.second) { + ghost_elem_gids.insert(elem_gid); + } + } + + // Additional check: elements that are neighbors of our locally-owned elements + // but are owned by other ranks (these might already be in ghost_elem_gids, but check connectivity) + + for (int lid = 0; lid < num_new_elems; ++lid) { + size_t num_neighbors = intermediate_mesh.num_elems_in_elem(lid); + + for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { + size_t neighbor_lid = intermediate_mesh.elems_in_elem(lid, nbr_idx); + + if (neighbor_lid < static_cast(num_new_elems)) { + size_t neighbor_gid = intermediate_mesh.local_to_global_elem_mapping(neighbor_lid); + + // Check if neighbor is owned by this rank + auto it = elem_gid_to_rank.find(neighbor_gid); + if (it != elem_gid_to_rank.end() && it->second != rank) { + // Neighbor is owned by another rank - it's a ghost for us + 
ghost_elem_gids.insert(neighbor_gid); + } + } + } + } + + // Count unique ghost elements + intermediate_mesh.num_ghost_elems = ghost_elem_gids.size(); + + MPI_Barrier(MPI_COMM_WORLD); + double t_ghost_end = MPI_Wtime(); + + if (rank == 0) { + std::cout << " Finished calculating ghost elements" << std::endl; + std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." << std::endl; + } + + // Print ghost element info if requested + print_info = false; + for(int i = 0; i < world_size; i++) { + MPI_Barrier(MPI_COMM_WORLD); + if(rank == i && print_info) { + std::cout << "[rank " << rank << "] owns " << num_new_elems + << " elements and has " << intermediate_mesh.num_ghost_elems << " ghost elements" << std::endl; + std::cout << "[rank " << rank << "] owned element global IDs: "; + for (int j = 0; j < intermediate_mesh.num_elems; j++) { + std::cout << intermediate_mesh.local_to_global_elem_mapping(j) << " "; + } + + // Print global IDs of ghost elements + std::cout << std::endl << "[rank " << rank << "] ghost element global IDs: "; + for (const auto& gid : ghost_elem_gids) { + std::cout << gid << " "; + } + std::cout << std::endl; + } + + MPI_Barrier(MPI_COMM_WORLD); + } + + + + // Build the connectivity that includes ghost elements + // Create an extended mesh with owned elements first, then ghost elements appended + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build extended mesh with ghost elements" << std::endl; + + // Step 1: Extract ghost element-node connectivity from all_conn + // Build a map: ghost_elem_gid -> vector of node_gids (ordered as in all_conn) + std::map> ghost_elem_to_nodes; + for (const size_t& ghost_gid : ghost_elem_gids) { + ghost_elem_to_nodes[ghost_gid].reserve(nodes_per_elem); + } + + // Extract nodes for each ghost element from all_conn + // The all_conn array has pairs (elem_gid, node_gid) for each rank's elements + for (int r = 0; r < world_size; ++r) { + if (r == rank) 
continue; // Skip our own data (we already have owned element connectivity) + int num_pairs = conn_sizes[r] / 2; + + // Process pairs in order - each element's nodes are contiguous + for (int i = 0; i < num_pairs; ++i) { + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this is one of our ghost elements, record its node (in order) + auto it = ghost_elem_to_nodes.find(elem_gid); + if (it != ghost_elem_to_nodes.end()) { + it->second.push_back(node_gid); + } + } + } + + // Verify each ghost element has the correct number of nodes + for (auto& pair : ghost_elem_to_nodes) { + if (pair.second.size() != static_cast(nodes_per_elem)) { + std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first + << " has " << pair.second.size() << " nodes, expected " << nodes_per_elem << std::endl; + } + } + + // Step 2: Build extended node list (owned nodes first, then ghost-only nodes) + // Start with owned nodes + std::map node_gid_to_extended_lid; + int extended_node_lid = 0; + + // Add all owned nodes + for (int i = 0; i < intermediate_mesh.num_nodes; ++i) { + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(i); + node_gid_to_extended_lid[node_gid] = extended_node_lid++; + } + + // Add ghost-only nodes (nodes that belong to ghost elements but not to owned elements) + std::set ghost_only_nodes; + for (const auto& pair : ghost_elem_to_nodes) { + for (size_t node_gid : pair.second) { + // Check if we already have this node + if (node_gid_to_extended_lid.find(node_gid) == node_gid_to_extended_lid.end()) { + ghost_only_nodes.insert(node_gid); + } + } + } + + // Assign extended local IDs to ghost-only nodes + for (size_t node_gid : ghost_only_nodes) { + node_gid_to_extended_lid[node_gid] = extended_node_lid++; + } + + int total_extended_nodes = extended_node_lid; + + // Step 3: Prepare requests for ghost node coordinates from owning ranks (if needed later) + // Build request 
list: for each ghost node, find an owning rank via any ghost element that contains it + std::map> rank_to_ghost_node_requests; + for (size_t node_gid : ghost_only_nodes) { + // Find which rank owns an element containing this node + // Look through ghost elements + for (const auto& pair : ghost_elem_to_nodes) { + size_t ghost_elem_gid = pair.first; + const std::vector& nodes = pair.second; + bool found = false; + for (size_t ngid : nodes) { + if (ngid == node_gid) { + found = true; + break; + } + } + if (found) { + auto owner_it = elem_gid_to_rank.find(ghost_elem_gid); + if (owner_it != elem_gid_to_rank.end()) { + rank_to_ghost_node_requests[owner_it->second].push_back(node_gid); + break; + } + } + } + } + + // Step 4: Build extended element list and node connectivity + // Owned elements: 0 to num_new_elems-1 (already have these) + // Ghost elements: num_new_elems to num_new_elems + num_ghost_elems - 1 + + // Create extended element-node connectivity array + int total_extended_elems = intermediate_mesh.num_elems + intermediate_mesh.num_ghost_elems; + std::vector> extended_nodes_in_elem(total_extended_elems); + + // Copy owned element connectivity (convert to extended node LIDs) + for (int lid = 0; lid < intermediate_mesh.num_elems; ++lid) { + extended_nodes_in_elem[lid].reserve(nodes_per_elem); + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[lid].push_back(ext_lid); + } + } + + // Add ghost element connectivity (map ghost node GIDs to extended node LIDs) + int ghost_elem_ext_lid = intermediate_mesh.num_elems; + std::vector ghost_elem_gids_ordered(ghost_elem_gids.begin(), ghost_elem_gids.end()); + std::sort(ghost_elem_gids_ordered.begin(), ghost_elem_gids_ordered.end()); + + for (size_t ghost_gid : ghost_elem_gids_ordered) { + auto it = 
ghost_elem_to_nodes.find(ghost_gid); + if (it == ghost_elem_to_nodes.end()) continue; + + extended_nodes_in_elem[ghost_elem_ext_lid].reserve(nodes_per_elem); + for (size_t node_gid : it->second) { + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[ghost_elem_ext_lid].push_back(ext_lid); + } + ghost_elem_ext_lid++; + } + + MPI_Barrier(MPI_COMM_WORLD); + // Sequential rank-wise printing of extended mesh structure info + for (int r = 0; r < world_size; ++r) { + if (rank == r) { + std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; + std::cout << "[rank " << rank << "] - Owned elements: " << intermediate_mesh.num_elems << std::endl; + std::cout << "[rank " << rank << "] - Ghost elements: " << ghost_elem_gids.size() << std::endl; + std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << std::endl; + std::cout << "[rank " << rank << "] - Owned nodes: " << intermediate_mesh.num_nodes << std::endl; + std::cout << "[rank " << rank << "] - Ghost-only nodes: " << ghost_only_nodes.size() << std::endl; + std::cout << "[rank " << rank << "] - Total extended nodes: " << total_extended_nodes << std::endl; + std::cout << std::flush; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + // The extended_nodes_in_elem vector now contains the connectivity for both owned and ghost elements + // Each element's nodes are stored using extended local node IDs (0-based, contiguous) + + // Build reverse maps: extended_lid -> gid for nodes and elements + std::vector extended_lid_to_node_gid(total_extended_nodes); + for (const auto& pair : node_gid_to_extended_lid) { + extended_lid_to_node_gid[pair.second] = pair.first; + } + + // Build extended element GID list: owned first, then ghost + std::vector extended_lid_to_elem_gid(total_extended_elems); + // Owned elements + for (int i = 0; i < intermediate_mesh.num_elems; ++i) { + extended_lid_to_elem_gid[i] = 
intermediate_mesh.local_to_global_elem_mapping.host(i); + } + // Ghost elements (in sorted order) + for (size_t idx = 0; idx < ghost_elem_gids_ordered.size(); ++idx) { + extended_lid_to_elem_gid[intermediate_mesh.num_elems + idx] = ghost_elem_gids_ordered[idx]; + } + + + +// ****************************************************************************************** +// Build the final partitioned mesh +// ****************************************************************************************** + + + + + final_mesh.initialize_nodes(total_extended_nodes); + final_mesh.initialize_elems(total_extended_elems, 3); + final_mesh.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); + final_mesh.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); + for (int i = 0; i < total_extended_nodes; i++) { + final_mesh.local_to_global_node_mapping.host(i) = extended_lid_to_node_gid[i]; + } + for (int i = 0; i < total_extended_elems; i++) { + final_mesh.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; + } + final_mesh.local_to_global_node_mapping.update_device(); + final_mesh.local_to_global_elem_mapping.update_device(); + + final_mesh.num_ghost_elems = ghost_elem_gids.size(); + final_mesh.num_ghost_nodes = ghost_only_nodes.size(); + + // Set owned counts for write_vtk (excludes ghost elements/nodes) + final_mesh.num_owned_elems = intermediate_mesh.num_elems; + final_mesh.num_owned_nodes = intermediate_mesh.num_nodes; + + + // Print num ghost elements and nodes on each rank sequentially + for (int r = 0; r < world_size; ++r) { + if (rank == r) { + std::cout << "*******[rank " << rank << "] - Ghost elements: " << final_mesh.num_ghost_elems << std::endl; + std::cout << "*******[rank " << rank << "] - Ghost-only nodes: " << final_mesh.num_ghost_nodes << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global 
node ids to the local node ids"< coordinate. + // 3. Use this map to fill final_node.coords. + + // 1. Build list of all global node IDs needed on this rank (owned + ghosts) + std::vector all_needed_node_gids(total_extended_nodes); + for (int i = 0; i < total_extended_nodes; ++i) { + all_needed_node_gids[i] = final_mesh.local_to_global_node_mapping.host(i); + } + + // 2. Build owned node GIDs and their coordinates + std::vector owned_gids(intermediate_mesh.num_nodes); + for (int i = 0; i < owned_gids.size(); ++i) + owned_gids[i] = intermediate_mesh.local_to_global_node_mapping.host(i); + + // 3. Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) + // so we can distribute the needed coordinate data. + // The easiest is to Allgather everyone's "owned_gids" and coords + + int local_owned_count = static_cast(owned_gids.size()); + std::vector owned_counts(world_size, 0); + if (local_owned_count < 0) local_owned_count = 0; // Clean up possibility of -1 + + // a) Gather counts + owned_counts.resize(world_size, 0); + MPI_Allgather(&local_owned_count, 1, MPI_INT, owned_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // b) Displacements and total + std::vector owned_displs(world_size,0); + int total_owned = 0; + for (int r=0; r all_owned_gids(total_owned); + MPI_Allgatherv(owned_gids.data(), local_owned_count, MPI_UNSIGNED_LONG_LONG, + all_owned_gids.data(), owned_counts.data(), owned_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + // d) Global coords (size: total_owned x 3) + std::vector owned_coords_send(3*local_owned_count, 0.0); + for (int i=0; i all_owned_coords(3 * total_owned, 0.0); + + // Create coordinate-specific counts and displacements (in units of doubles, not nodes) + std::vector coord_counts(world_size); + std::vector coord_displs(world_size); + for (int r=0; r coord[3] + std::unordered_map> gid_to_coord; + for (int i=0; i xyz = { + all_owned_coords[3*i+0], + all_owned_coords[3*i+1], + all_owned_coords[3*i+2] + 
}; + gid_to_coord[all_owned_gids[i]] = xyz; + } + + // 4. Finally, fill final_node.coords with correct coordinates. + for (int i = 0; i < total_extended_nodes; ++i) { + size_t gid = final_mesh.local_to_global_node_mapping.host(i); + auto it = gid_to_coord.find(gid); + if (it != gid_to_coord.end()) { + final_node.coords.host(i,0) = it->second[0]; + final_node.coords.host(i,1) = it->second[1]; + final_node.coords.host(i,2) = it->second[2]; + } else { + // Could happen if there's a bug: fill with zeros for safety + final_node.coords.host(i,0) = 0.0; + final_node.coords.host(i,1) = 0.0; + final_node.coords.host(i,2) = 0.0; + } + } + final_node.coords.update_device(); + + + // -------------------------------------------------------------------------------------- + // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost element GIDs. + // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. + // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
+ // -------------------------------------------------------------------------------------- + std::vector>> boundary_elem_targets(intermediate_mesh.num_elems); + + // Prepare local ghost list as vector + std::vector ghost_gids_vec; + ghost_gids_vec.reserve(ghost_elem_gids.size()); + for (const auto &g : ghost_elem_gids) ghost_gids_vec.push_back(g); + + // Exchange counts + std::vector ghost_counts(world_size, 0); + int local_ghost_count = static_cast(ghost_gids_vec.size()); + MPI_Allgather(&local_ghost_count, 1, MPI_INT, ghost_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // Displacements and recv buffer + std::vector ghost_displs(world_size, 0); + int total_ghosts = 0; + for (int r = 0; r < world_size; ++r) { + ghost_displs[r] = total_ghosts; + total_ghosts += ghost_counts[r]; + } + std::vector all_ghost_gids(total_ghosts); + + // Gather ghost gids + MPI_Allgatherv(ghost_gids_vec.data(), local_ghost_count, MPI_UNSIGNED_LONG_LONG, + all_ghost_gids.data(), ghost_counts.data(), ghost_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Finished gathering ghost element GIDs" << std::endl; + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build the reverse map for communication" << std::endl; + // Build map gid -> ranks that ghost it + std::unordered_map> gid_to_ghosting_ranks; + gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); + for (int r = 0; r < world_size; ++r) { + int cnt = ghost_counts[r]; + int off = ghost_displs[r]; + for (int i = 0; i < cnt; ++i) { + size_t g = all_ghost_gids[off + i]; + gid_to_ghosting_ranks[g].push_back(r); + } + } + + // For each local element, list destinations: ranks that ghost our gid + for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { + size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); + auto it = gid_to_ghosting_ranks.find(local_elem_gid); + if (it == 
gid_to_ghosting_ranks.end()) continue; + const std::vector &dest_ranks = it->second; + for (int rr : dest_ranks) { + if (rr == rank) continue; + boundary_elem_targets[elem_lid].push_back(std::make_pair(rr, local_elem_gid)); + } + } + + std::cout.flush(); + MPI_Barrier(MPI_COMM_WORLD); + // Optional: print a compact summary of reverse map for verification (limited output) + for(int i = 0; i < world_size; i++) { + if (rank == i && print_info) { + std::cout << std::endl; + for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { + + size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); + if (boundary_elem_targets[elem_lid].empty()) + { + std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: no ghost elements" << std::endl; + } + else + { + std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: "; + int shown = 0; + for (const auto &pr : boundary_elem_targets[elem_lid]) { + if (shown >= 12) { std::cout << " ..."; break; } + std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; + shown++; + } + std::cout << std::endl; + } + } + std::cout.flush(); + } + MPI_Barrier(MPI_COMM_WORLD); + } + + print_info = false; + + + MPI_Barrier(MPI_COMM_WORLD); + + + + + +} + + + + + + + + + +#endif \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 4cc6da9e..bc3e8371 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -1,59 +1,25 @@ -#include -#include -#include -#include -#include -#include -#include -#include +// #include +// #include +// #include +// #include +// #include +// #include +// #include +// #include -#include "mesh.h" -#include "state.h" -#include "mesh_io.h" +// #include "mesh.h" +// #include "state.h" +// #include "mesh_io.h" + +#include "decomp_utils.h" // Include Scotch 
headers #include "scotch.h" #include "ptscotch.h" - -void calc_elements_per_rank(std::vector& elems_per_rank, int num_elems, int world_size){ - // Compute elements to send to each rank; handle remainders for non-even distribution - std::fill(elems_per_rank.begin(), elems_per_rank.end(), num_elems / world_size); - int remainder = num_elems % world_size; - for (int i = 0; i < remainder; ++i) { - elems_per_rank[i] += 1; - } -} - -void print_rank_mesh_info(Mesh_t& mesh, int rank) { - - std::cout< elements_on_rank; - std::vector nodes_on_rank; - - - std::vector elems_per_rank(world_size); // number of elements to send to each rank size(world_size) - std::vector nodes_per_rank(world_size); // number of nodes to send to each rank size(world_size) - - // create a 2D vector of elements to send to each rank - std::vector> elements_to_send(world_size); - - // create a 2D vector of nodes to send to each rank - std::vector> nodes_to_send(world_size); - - // Create a 2D vector to hold the nodal positions on each rank - std::vector> node_pos_to_send(world_size); - - // create a 2D vector to hold the node positions on each rank - std::vector> node_pos_on_rank(world_size); - - // ******************************************************** // Build the initial mesh // ******************************************************** @@ -127,1889 +54,17 @@ int main(int argc, char** argv) { std::cout<<"Rank "< all_elements; - std::vector sendcounts(world_size); - std::vector displs(world_size); - - int displacement = 0; - for (int i = 0; i < world_size; i++) { - sendcounts[i] = elems_per_rank[i]; - displs[i] = displacement; - // Copy elements for rank i to the flattened array - for (int j = 0; j < elems_per_rank[i]; j++) { - all_elements.push_back(elements_to_send[i][j]); - } - displacement += elems_per_rank[i]; - } - - // Send the elements to each rank - MPI_Scatterv(all_elements.data(), sendcounts.data(), displs.data(), MPI_INT, - elements_on_rank.data(), num_elements_on_rank, MPI_INT, - 0, 
MPI_COMM_WORLD); - } - else { - MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - elements_on_rank.data(), num_elements_on_rank, MPI_INT, - 0, MPI_COMM_WORLD); - } - - MPI_Barrier(MPI_COMM_WORLD); - double t_scatter_gids_end = MPI_Wtime(); - if(rank == 0) { - std::cout<<" Finished scattering the actual element global ids to each rank"< nodes_set; - for (int j = 0; j < elems_per_rank[i]; j++) { - for (int k = 0; k < num_nodes_per_elem; k++) { - nodes_set.insert(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); - } - } - nodes_to_send[i] = std::vector(nodes_set.begin(), nodes_set.end()); - } - - for (int i = 0; i < world_size; i++) { - nodes_per_rank[i] = nodes_to_send[i].size(); - } - - if (print_info) { - - - std::cout< all_nodes; - std::vector sendcounts(world_size); - std::vector displs(world_size); - - int displacement = 0; - for (int i = 0; i < world_size; i++) { - sendcounts[i] = nodes_to_send[i].size(); - displs[i] = displacement; - // Copy nodes for rank i to the flattened array - for (int j = 0; j < nodes_to_send[i].size(); j++) { - all_nodes.push_back(nodes_to_send[i][j]); - } - displacement += nodes_to_send[i].size(); - } - // Send the nodes to each rank - // all_nodes.data(): Pointer to the flattened array of all nodes to be sent to each rank - // sendcounts.data(): Array with the number of nodes to send to each rank - // displs.data(): Array with the displacement for each rank in the flattened array - // MPI_INT: Data type of the nodes (integer) - // nodes_on_rank.data(): Pointer to the buffer where each rank will receive its nodes - // num_nodes_on_rank: Number of nodes that the receiving rank expects to receive - // MPI_INT: Data type of the receive buffer (integer) - // 0: The root rank (rank 0) that is performing the scatter - // MPI_COMM_WORLD: The communicator - MPI_Scatterv(all_nodes.data(), sendcounts.data(), displs.data(), MPI_INT, - nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, - 0, MPI_COMM_WORLD); - } - else { - 
MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - nodes_on_rank.data(), num_nodes_on_rank, MPI_INT, - 0, MPI_COMM_WORLD); - } - - MPI_Barrier(MPI_COMM_WORLD); - - // Timer: End measuring time for scattering node global ids - double t_scatter_nodeids_end = MPI_Wtime(); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) { - std::cout<<" Finished scattering the actual node global ids to each rank"< node_pos_on_rank_flat(num_nodes_on_rank * 3); - - // Timer for scattering node positions - double t_scatter_nodepos_start = MPI_Wtime(); - - if(rank == 0) - { - for (int i = 0; i < world_size; i++) { - for(int node_gid = 0; node_gid < nodes_to_send[i].size(); node_gid++) - { - node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 0)); - node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 1)); - node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 2)); - } - } - - // Prepare data for MPI_Scatterv (scatter with variable counts) - // Flatten the 2D node_pos_to_send into a 1D array - std::vector all_node_pos; - std::vector sendcounts(world_size); - std::vector displs(world_size); - - int displacement = 0; - for (int i = 0; i < world_size; i++) { - sendcounts[i] = nodes_to_send[i].size() * 3; - displs[i] = displacement; // displacement is the starting index of the nodes for the current rank in the flattened array - // Copy node positions for rank i to the flattened array - for(int j = 0; j < nodes_to_send[i].size(); j++) { - for(int k = 0; k < 3; k++) { - all_node_pos.push_back(node_pos_to_send[i][j * 3 + k]); - } - } - displacement += nodes_to_send[i].size() * 3; - } - - // Send the node positions to each rank - MPI_Scatterv(all_node_pos.data(), sendcounts.data(), displs.data(), MPI_DOUBLE, - node_pos_on_rank_flat.data(), num_nodes_on_rank * 3, MPI_DOUBLE, - 0, MPI_COMM_WORLD); - } - else { - MPI_Scatterv(nullptr, nullptr, nullptr, MPI_DOUBLE, - node_pos_on_rank_flat.data(), 
num_nodes_on_rank * 3, MPI_DOUBLE, - 0, MPI_COMM_WORLD); - } - - MPI_Barrier(MPI_COMM_WORLD); - - if (rank == 0 && print_info) { - // Print out the node positions on this rank - std::cout << "Rank " << rank << " received node positions: "; - for (int i = 0; i < num_nodes_on_rank; i++) { - std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " - << node_pos_on_rank_flat[i*3+1] << ", " - << node_pos_on_rank_flat[i*3+2] << ") "; - } - std::cout << std::endl; - } - - - MPI_Barrier(MPI_COMM_WORLD); - - if (rank == 1 && print_info) { - // Print out the node positions on this rank - std::cout << "Rank " << rank << " received node positions: "; - for (int i = 0; i < num_nodes_on_rank; i++) { - std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " - << node_pos_on_rank_flat[i*3+1] << ", " - << node_pos_on_rank_flat[i*3+2] << ") "; - } - std::cout << std::endl; - } - - MPI_Barrier(MPI_COMM_WORLD); - - double t_scatter_nodepos_end = MPI_Wtime(); - if(rank == 0) { - std::cout<<" Finished scattering the node positions to each rank"< required_node_state = { node_state::coords }; - node.initialize(num_nodes_on_rank, 3, required_node_state); - - for(int i = 0; i < num_nodes_on_rank; i++) { - node.coords.host(i, 0) = node_pos_on_rank_flat[i*3]; - node.coords.host(i, 1) = node_pos_on_rank_flat[i*3+1]; - node.coords.host(i, 2) = node_pos_on_rank_flat[i*3+2]; - } - - node.coords.update_device(); - -// ****************************************************************************************** -// Send the element-node connectivity data from the initial mesh to each rank -// ****************************************************************************************** - - // Send the element-node connectivity data from the initial mesh to each rank - std::vector nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); - - double t_scatter_elemnode_start = MPI_Wtime(); - - if (rank == 0) { - // Prepare element-node connectivity data for each rank - std::vector all_nodes_in_elem; - 
std::vector sendcounts(world_size); - std::vector displs(world_size); - - int displacement = 0; - for(int i = 0; i < world_size; i++) { - int num_connectivity_entries = elements_to_send[i].size() * num_nodes_per_elem; // num_nodes_per_elem nodes per element - sendcounts[i] = num_connectivity_entries; - displs[i] = displacement; - - // Copy element-node connectivity for rank i - for(int j = 0; j < elements_to_send[i].size(); j++) { - for(int k = 0; k < num_nodes_per_elem; k++) { - all_nodes_in_elem.push_back(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); - } - } - displacement += num_connectivity_entries; - } - // Send the connectivity data to each rank - MPI_Scatterv(all_nodes_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, - nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, - 0, MPI_COMM_WORLD); - } - else { - MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, - 0, MPI_COMM_WORLD); - } - - MPI_Barrier(MPI_COMM_WORLD); - - double t_scatter_elemnode_end = MPI_Wtime(); - if(rank == 0) { - std::cout << " Finished scattering the element-node connectivity data from the initial mesh to each rank" << std::endl; - std::cout << " Scattering element-node connectivity took " - << (t_scatter_elemnode_end - t_scatter_elemnode_start) << " seconds." 
<< std::endl; - } - - if (rank == 0 && print_info) { - - std::cout << "Rank " << rank << " received element-node connectivity (" - << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; - for (int elem = 0; elem < num_elements_on_rank; elem++) { - std::cout << " Element " << elem << " nodes: "; - for (int node = 0; node < num_nodes_per_elem; node++) { - int idx = elem * num_nodes_per_elem + node; - std::cout << nodes_in_elem_on_rank[idx] << " "; - } - std::cout << std::endl; - } - } - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished scattering the element-node connectivity data from the initial mesh to each rank"< elem_elem_counts(world_size); - int total_elem_elem_entries = 0; - - - double t_scatter_elem_elem_start = MPI_Wtime(); - - if (rank == 0){ - // Calculate total number of connectivity entries for each rank - for(int i = 0; i < world_size; i++) { - elem_elem_counts[i] = 0; - for(int k = 0; k < elements_to_send[i].size(); k++) { - elem_elem_counts[i] += initial_mesh.num_elems_in_elem(elements_to_send[i][k]); - } - - if(print_info) std::cout << "Rank " << i << " will receive " << elem_elem_counts[i] << " element-element connectivity entries" << std::endl; - } - - // Print element-element connectivity entries for each rank in the initial mesh - if(print_info) { - for(int i = 0; i < world_size; i++) { - std::cout << std::endl; - std::cout << "Rank " << i << " will receive element-element connectivity entries for the following elements: "< elems_in_elem_on_rank(total_elem_elem_entries); - - // Now scatter the num_elems_in_elem for each element on each rank - std::vector num_elems_in_elem_per_rank(num_elements_on_rank); - - if (rank == 0) { - std::vector all_num_elems_in_elem; - std::vector displs_ee(world_size); - int displacement = 0; - - for(int i = 0; i < world_size; i++) { - displs_ee[i] = displacement; - for(int k = 0; k < elements_to_send[i].size(); k++) { - 
all_num_elems_in_elem.push_back(initial_mesh.num_elems_in_elem(elements_to_send[i][k])); - } - displacement += elements_to_send[i].size(); - } - - MPI_Scatterv(all_num_elems_in_elem.data(), elems_per_rank.data(), displs_ee.data(), MPI_INT, - num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, - 0, MPI_COMM_WORLD); - } else { - MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, - 0, MPI_COMM_WORLD); - } - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished scattering the actual element-element connectivity counts per element to each rank"< all_elems_in_elem; - std::vector sendcounts(world_size); - std::vector displs(world_size); - - int displacement = 0; - - for(int i = 0; i < world_size; i++) { - sendcounts[i] = elem_elem_counts[i]; - displs[i] = displacement; - - // Copy element-element connectivity for rank i - for(int k = 0; k < elements_to_send[i].size(); k++) { - for(int l = 0; l < initial_mesh.num_elems_in_elem(elements_to_send[i][k]); l++) { - all_elems_in_elem.push_back(initial_mesh.elems_in_elem(elements_to_send[i][k], l)); - } - } - displacement += elem_elem_counts[i]; - } - - // Send the element-element connectivity data to each rank using MPI_Scatterv - MPI_Scatterv(all_elems_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, - elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, - 0, MPI_COMM_WORLD); - } - else { - MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, - 0, MPI_COMM_WORLD); - } - - MPI_Barrier(MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished receiving the actual element-element connectivity entries to each rank"<(num_nodes_on_rank, "mesh.local_to_global_node_mapping"); - mesh.local_to_global_elem_mapping = DCArrayKokkos(num_elements_on_rank, "mesh.local_to_global_elem_mapping"); - - for(int i = 0; i < num_nodes_on_rank; i++) 
{ - mesh.local_to_global_node_mapping.host(i) = nodes_on_rank[i]; - } - - for(int i = 0; i < num_elements_on_rank; i++) { - mesh.local_to_global_elem_mapping.host(i) = elements_on_rank[i]; - } - - mesh.local_to_global_node_mapping.update_device(); - mesh.local_to_global_elem_mapping.update_device(); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< vertloctab: - * CSR array [size vertlocnbr+1]: for each local vertex i, vertloctab[i] - * gives the index in edgeloctab where the neighbor list of vertex i begins. - * PT-Scotch expects this array to be of size vertlocnbr+1, where the difference - * vertloctab[i+1] - vertloctab[i] gives the number of edges for vertex i. - * - * - std::vector edgeloctab: - * CSR array [variable size]: a flattened list of *neighboring element global IDs*, - * in no particular order. For vertex i, its neighbors are located at - * edgeloctab[vertloctab[i]...vertloctab[i+1]-1]. - * In this compact CSR, these are global IDs (GIDs), enabling PT-Scotch to - * recognize edges both within and across ranks. - * - * - std::map elem_gid_to_offset: - * Helper map: For a given element global ID, gives the starting offset in - * the flattened neighbor array (elems_in_elem_on_rank) where this element's - * list of neighbor GIDs begins. This allows efficient neighbor list lookup. - * - * - (other arrays used, from mesh setup and communication phase) - * - elements_on_rank: vector of global element IDs owned by this rank. - * - num_elements_on_rank: number of owned elements. - * - num_elems_in_elem_per_rank: array, for each owned element, how many - * neighbors it has. - * - elems_in_elem_on_rank: flattened array of global neighbor IDs for all local elements. 
- * - **********************************************************************************/ - - // --- Step 1: Initialize the PT-Scotch distributed graph object on this MPI rank --- - SCOTCH_Dgraph dgraph; - if (SCOTCH_dgraphInit(&dgraph, MPI_COMM_WORLD) != 0) { - std::cerr << "[rank " << rank << "] SCOTCH_dgraphInit failed\n"; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - // Set base value for numbering (0 for C-style arrays) - const SCOTCH_Num baseval = 0; - - // vertlocnbr: Number of elements (vertices) that are local to this MPI rank - const SCOTCH_Num vertlocnbr = static_cast(mesh.num_elems); - - // vertlocmax: Maximum possible local vertices (no holes, so identical to vertlocnbr) - const SCOTCH_Num vertlocmax = vertlocnbr; - - // --- Step 2: Build compact CSR arrays for PT-Scotch (vertloctab, edgeloctab) --- - // vertloctab: for each local mesh element [vertex], gives index in edgeloctab where its neighbor list begins - std::vector vertloctab(vertlocnbr + 1); - - // edgeloctab: flat array of neighbor global IDs for all local elements, built in order - std::vector edgeloctab; - edgeloctab.reserve(vertlocnbr * 6); // heuristic: assume typical mesh degree is ~6, for performance - - // Construct a map from element GID to its offset into elems_in_elem_on_rank (the array of neighbor GIDs) - // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. 
- std::map elem_gid_to_offset; - size_t current_offset = 0; - for (size_t k = 0; k < num_elements_on_rank; k++) { - elem_gid_to_offset[elements_on_rank[k]] = current_offset; - current_offset += num_elems_in_elem_per_rank[k]; - } - - // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- - SCOTCH_Num offset = 0; // running count of edges encountered - - for (size_t lid = 0; lid < mesh.num_elems; ++lid) { - - // Record current edge offset for vertex lid in vertloctab - vertloctab[lid] = offset; - - // Obtain this local element's global ID (from mapping) - int elem_gid = mesh.local_to_global_elem_mapping.host(lid); - - // Find offset in the flattened neighbor array for this element's neighbor list - size_t elems_in_elem_offset = elem_gid_to_offset[elem_gid]; - - // For this element, find the count of its neighbors - // This requires finding its index in the elements_on_rank array - size_t idx = 0; - for (size_t k = 0; k < num_elements_on_rank; k++) { - if (elements_on_rank[k] == elem_gid) { - idx = k; - break; - } - } - size_t num_nbrs = num_elems_in_elem_per_rank[idx]; - - // Append each neighbor (by its GLOBAL elem GID) to edgeloctab - for (size_t j = 0; j < num_nbrs; j++) { - size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; // This is a global element ID! 
- edgeloctab.push_back(static_cast(neighbor_gid)); - ++offset; // Increment running edge count - } - } - - // vertloctab[vertlocnbr] stores total number of edges written, finalizes the CSR structure - vertloctab[vertlocnbr] = offset; - - // edgelocnbr/edgelocsiz: Number of edge endpoints defined locally - // (PT-Scotch's distributed graphs allow edges to be replicated or owned by either endpoint) - const SCOTCH_Num edgelocnbr = offset; // total number of edge endpoints (sum of all local neighbor degrees) - const SCOTCH_Num edgelocsiz = edgelocnbr; // allocated size matches number of endpoints - - // Optionally print graph structure for debugging/validation - if (print_info) { - std::cout << "Rank " << rank << ": vertlocnbr = # of local elements(vertices) = " << vertlocnbr - << ", edgelocnbr = # of local edge endpoints = " << edgelocnbr << std::endl; - std::cout << "vertloctab (CSR row offsets): "; - for (size_t i = 0; i <= vertlocnbr; i++) { - std::cout << vertloctab[i] << " "; - } - std::cout << std::endl; - std::cout << "edgeloctab (first 20 neighbor GIDs): "; - for (size_t i = 0; i < std::min((size_t)20, edgeloctab.size()); i++) { - std::cout << edgeloctab[i] << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - - /************************************************************************** - * Step 4: Build the distributed graph using PT-Scotch's SCOTCH_dgraphBuild - * - * - PT-Scotch will use our CSR arrays. Since we use compact representation, - * most optional arrays ("veloloctab", "vlblloctab", "edgegsttab", "edloloctab") - * can be passed as nullptr. - * - edgeloctab contains *GLOBAL element GIDs* of neighbors. PT-Scotch uses this - * to discover connections across processor boundaries, so you do not have to - * encode ownership or partition information yourself. 
- **************************************************************************/ - int rc = SCOTCH_dgraphBuild( - &dgraph, - baseval, // start index (0) - vertlocnbr, // local vertex count (local elements) - vertlocmax, // local vertex max (no holes) - vertloctab.data(), // row offsets in edgeloctab - /*vendloctab*/ nullptr, // end of row offsets (compact CSR => nullptr) - /*veloloctab*/ nullptr, // vertex weights, not used - /*vlblloctab*/ nullptr, // vertex global labels (we use GIDs in edgeloctab) - edgelocnbr, // local edge endpoints count - edgelocsiz, // size of edge array - edgeloctab.data(), // global neighbor IDs for each local node - /*edgegsttab*/ nullptr, // ghost edge array, not used - /*edloloctab*/ nullptr // edge weights, not used - ); - if (rc != 0) { - std::cerr << "[rank " << rank << "] SCOTCH_dgraphBuild failed rc=" << rc << "\n"; - SCOTCH_dgraphFree(&dgraph); - MPI_Abort(MPI_COMM_WORLD, rc); - } - - // Optionally, print rank summary after graph build for further validation - if (print_info) { - SCOTCH_Num vertlocnbr_out; - SCOTCH_dgraphSize(&dgraph, &vertlocnbr_out, nullptr, nullptr, nullptr); - std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr = " << vertlocnbr_out << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished building the distributed graph using PT-Scotch"<(world_size)); - - - - - // ===================== PT-Scotch Strategy Selection and Documentation ====================== - // The PT-Scotch "strategy" (stratdat here) controls the algorithms and heuristics used for partitioning. - // You can specify a string or build a strategy using functions that adjust speed, quality, and recursion. - // - // Common strategy flags (see "scotch.h", "ptscotch.h", and PT-Scotch documentation): - // - // - SCOTCH_STRATDEFAULT: Use the default (fast, reasonable quality) partitioning strategy. - // Useful for quick, generic partitions where quality is not critical. 
- // - // - SCOTCH_STRATSPEED: Aggressively maximizes speed (at the cost of cut quality). - // For large runs or test runs where speed is more important than minimizing edgecut. - // - // - SCOTCH_STRATQUALITY: Prioritizes partition *quality* (minimizing edge cuts, maximizing load balance). - // Slower than the default. Use when high-quality partitioning is desired. - // - // - SCOTCH_STRATBALANCE: Tradeoff between speed and quality for balanced workload across partitions. - // Use if load balance is more critical than cut size. - // - // Additional Options: - // - Strategy can also be specified as a string (see Scotch manual, e.g., "b{sep=m{...} ...}"). - // - Recursion count parameter (here, set to 0) controls strategy recursion depth (0 = automatic). - // - Imbalance ratio (here, 0.01) allows minor imbalance in part weight for better cut quality. - // - // Example usage: - // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATQUALITY, nparts, 0, 0.01); - // ^ quality-focused, nparts=number of parts/ranks - // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATSPEED, nparts, 0, 0.05); - // ^ speed-focused, allow 5% imbalance - // - // Reference: - // - https://gitlab.inria.fr/scotch/scotch/-/blob/master/doc/libptscotch.pdf - // - SCOTCH_stratDgraphMapBuild() and related "strategy" documentation. - // - // --------------- Set up the desired partitioning strategy here: --------------- - SCOTCH_Strat stratdat; // PT-Scotch strategy object: holds partitioning options/settings - SCOTCH_stratInit(&stratdat); - - // Select partitioning strategy for this run: - // Use SCOTCH_STRATQUALITY for best cut quality. - // To change: replace with SCOTCH_STRATDEFAULT, SCOTCH_STRATSPEED, or SCOTCH_STRATBALANCE as discussed above. 
- // Arguments: (strategy object, strategy flag, #parts, recursion (0=auto), imbalance ratio) - SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.01); - - // partloctab: output array mapping each local element (vertex) to a *target partition number* - // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. - std::vector partloctab(vertlocnbr); - rc = SCOTCH_dgraphMap(&dgraph, &archdat, &stratdat, partloctab.data()); - if (rc != 0) { - std::cerr << "[rank " << rank << "] SCOTCH_dgraphMap failed rc=" << rc << "\n"; - SCOTCH_stratExit(&stratdat); - SCOTCH_archExit(&archdat); - SCOTCH_dgraphFree(&dgraph); - MPI_Abort(MPI_COMM_WORLD, rc); - } - - // Clean up PT-Scotch strategy and architecture objects - SCOTCH_stratExit(&stratdat); - SCOTCH_archExit(&archdat); - - // Free the graph now that we have the partition assignments - SCOTCH_dgraphFree(&dgraph); - - /*************************************************************************** - * Step 7 (Optional): Print out the partitioning assignment per element - * - Each local element's local index lid and global ID (gid) are listed with the - * part to which PT-Scotch has assigned them. 
- ***************************************************************************/ - print_info = false; - for(int rank_id = 0; rank_id < world_size; rank_id++) { - if(rank_id == rank && print_info) { - for (size_t lid = 0; lid < mesh.num_elems; ++lid) { - size_t gid = mesh.local_to_global_elem_mapping.host(lid); - std::cout << "[rank " << rank_id << "] elem_local=" << lid << " gid=" << gid - << " -> part=" << partloctab[lid] << "\n"; - } - MPI_Barrier(MPI_COMM_WORLD); - } - MPI_Barrier(MPI_COMM_WORLD); - } - print_info = false; - - - -// ****************************************************************************************** -// Build the final mesh from the repartition -// ****************************************************************************************** - - - - MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0) std::cout << "\n=== Starting Mesh Redistribution Phase ===\n"; - MPI_Barrier(MPI_COMM_WORLD); - - // -------------- Phase 1: Determine elements to send to each rank -------------- - std::vector> elems_to_send(world_size); - for (int lid = 0; lid < mesh.num_elems; ++lid) { - int dest = static_cast(partloctab[lid]); - int elem_gid = static_cast(mesh.local_to_global_elem_mapping.host(lid)); - elems_to_send[dest].push_back(elem_gid); - } - - // -------------- Phase 2: Exchange element GIDs -------------- - std::vector sendcounts(world_size), recvcounts(world_size); - for (int r = 0; r < world_size; ++r) - sendcounts[r] = static_cast(elems_to_send[r].size()); - - MPI_Alltoall(sendcounts.data(), 1, MPI_INT, recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - - // Compute displacements - std::vector sdispls(world_size), rdispls(world_size); - int send_total = 0, recv_total = 0; - for (int r = 0; r < world_size; ++r) { - sdispls[r] = send_total; - rdispls[r] = recv_total; - send_total += sendcounts[r]; - recv_total += recvcounts[r]; - } - - - // Flatten send buffer - std::vector sendbuf; - sendbuf.reserve(send_total); - for (int r 
= 0; r < world_size; ++r) - sendbuf.insert(sendbuf.end(), elems_to_send[r].begin(), elems_to_send[r].end()); - - // Receive new local element GIDs - std::vector recvbuf(recv_total); - MPI_Alltoallv(sendbuf.data(), sendcounts.data(), sdispls.data(), MPI_INT, - recvbuf.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging element GIDs"< new_elem_gids = recvbuf; - int num_new_elems = static_cast(new_elem_gids.size()); - - - if (print_info) { - std::cout << "[rank " << rank << "] new elems: " << num_new_elems << std::endl; - } - - // -------------- Phase 3: Send element–node connectivity -------------- - int nodes_per_elem = mesh.num_nodes_in_elem; - - // Flatten element-node connectivity by global node IDs - std::vector conn_sendbuf; - for (int r = 0; r < world_size; ++r) { - for (int gid : elems_to_send[r]) { - // find local element lid from gid - int lid = -1; - for (int i = 0; i < mesh.num_elems; ++i) - if (mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } - - for (int j = 0; j < nodes_per_elem; j++) { - int node_lid = mesh.nodes_in_elem.host(lid, j); - int node_gid = mesh.local_to_global_node_mapping.host(node_lid); - conn_sendbuf.push_back(node_gid); - } - } - } - - // element-node connectivity counts (ints per dest rank) - std::vector conn_sendcounts(world_size), conn_recvcounts(world_size); - for (int r = 0; r < world_size; ++r) - conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; - - MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging element–node connectivity counts"< conn_sdispls(world_size), conn_rdispls(world_size); - int conn_send_total = 0, conn_recv_total = 0; - for (int r = 0; r < world_size; ++r) { - conn_sdispls[r] = conn_send_total; - conn_rdispls[r] = conn_recv_total; - conn_send_total += 
conn_sendcounts[r]; - conn_recv_total += conn_recvcounts[r]; - } - - std::vector conn_recvbuf(conn_recv_total); - MPI_Alltoallv(conn_sendbuf.data(), conn_sendcounts.data(), conn_sdispls.data(), MPI_INT, - conn_recvbuf.data(), conn_recvcounts.data(), conn_rdispls.data(), MPI_INT, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging element–node connectivity"< node_gid_set(conn_recvbuf.begin(), conn_recvbuf.end()); - std::vector new_node_gids(node_gid_set.begin(), node_gid_set.end()); - int num_new_nodes = static_cast(new_node_gids.size()); - - // Build map gid→lid - std::unordered_map node_gid_to_lid; - for (int i = 0; i < num_new_nodes; ++i) - node_gid_to_lid[new_node_gids[i]] = i; - - if (print_info) - std::cout << "[rank " << rank << "] owns " << num_new_nodes << " unique nodes\n"; - - - // -------------- Phase 5: Request node coordinates -------------- - std::vector node_coords_sendbuf; - for (int r = 0; r < world_size; ++r) { - for (int gid : elems_to_send[r]) { - int lid = -1; - for (int i = 0; i < mesh.num_elems; ++i) - if (mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } - - for (int j = 0; j < nodes_per_elem; j++) { - int node_lid = mesh.nodes_in_elem.host(lid, j); - int node_gid = mesh.local_to_global_node_mapping.host(node_lid); - - node_coords_sendbuf.push_back(node.coords.host(node_lid, 0)); - node_coords_sendbuf.push_back(node.coords.host(node_lid, 1)); - node_coords_sendbuf.push_back(node.coords.host(node_lid, 2)); - } - } - } - - // Each node is 3 doubles; same sendcounts scaling applies - std::vector coord_sendcounts(world_size), coord_recvcounts(world_size); - for (int r = 0; r < world_size; ++r) - coord_sendcounts[r] = sendcounts[r] * nodes_per_elem * 3; - - MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging node coordinates counts"< 
coord_sdispls(world_size), coord_rdispls(world_size); - int coord_send_total = 0, coord_recv_total = 0; - for (int r = 0; r < world_size; ++r) { - coord_sdispls[r] = coord_send_total; - coord_rdispls[r] = coord_recv_total; - coord_send_total += coord_sendcounts[r]; - coord_recv_total += coord_recvcounts[r]; - } - - std::vector coord_recvbuf(coord_recv_total); - MPI_Alltoallv(node_coords_sendbuf.data(), coord_sendcounts.data(), coord_sdispls.data(), MPI_DOUBLE, - coord_recvbuf.data(), coord_recvcounts.data(), coord_rdispls.data(), MPI_DOUBLE, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging node coordinates"<(num_new_nodes); - intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); - - // Fill global mappings - for (int i = 0; i < num_new_nodes; ++i) - intermediate_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; - for (int i = 0; i < num_new_elems; ++i) - intermediate_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; - - intermediate_mesh.local_to_global_node_mapping.update_device(); - intermediate_mesh.local_to_global_elem_mapping.update_device(); - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"<> node_gid_to_coords; - int coord_idx = 0; - for (int e = 0; e < num_new_elems; ++e) { - for (int j = 0; j < nodes_per_elem; j++) { - int node_gid = conn_recvbuf[e * nodes_per_elem + j]; - if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { - node_gid_to_coords[node_gid] = { - coord_recvbuf[coord_idx*3 + 0], - coord_recvbuf[coord_idx*3 + 1], - coord_recvbuf[coord_idx*3 + 2] - }; - } - coord_idx++; - } - } - - // Now fill coordinates in node order - intermediate_node.initialize(num_new_nodes, 3, {node_state::coords}); - for (int i = 0; i < num_new_nodes; ++i) { - int node_gid = new_node_gids[i]; - auto it = node_gid_to_coords.find(node_gid); 
- if (it != node_gid_to_coords.end()) { - intermediate_node.coords.host(i, 0) = it->second[0]; - intermediate_node.coords.host(i, 1) = it->second[1]; - intermediate_node.coords.host(i, 2) = it->second[2]; - } - } - intermediate_node.coords.update_device(); - - // Connectivity rebuild - intermediate_mesh.build_connectivity(); - MPI_Barrier(MPI_COMM_WORLD); - - - -// ****************************************************************************************** -// Build the ghost elements -// ****************************************************************************************** - - double t_ghost_start = MPI_Wtime(); - - // First, gather the number of elements each rank owns - std::vector elem_counts(world_size); - - // int MPI_Allgather( - // const void* sendbuf, // Data to send from this process - // int sendcount, // Number of elements to send - // MPI_Datatype sendtype, // Type of send data - // void* recvbuf, // Buffer to receive all data - // int recvcount, // Number of elements to receive from each process - // MPI_Datatype recvtype, // Type of receive data - // MPI_Comm comm // Communicator - // ); - MPI_Allgather(&intermediate_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - // Compute displacements - std::vector elem_displs(world_size); - int total_elems = 0; - for (int r = 0; r < world_size; ++r) { - elem_displs[r] = total_elems; - total_elems += elem_counts[r]; - } - - // Gather all element GIDs from all ranks - std::vector all_elem_gids(total_elems); - - // int MPI_Allgatherv( - // const void* sendbuf, // Data to send from this process - // int sendcount, // Number of elements THIS process sends - // MPI_Datatype sendtype, // Type of send data - // void* recvbuf, // Buffer to receive all data - // const int* recvcounts, // Array: number of elements from each process - // const int* displs, // Array: displacement for each process's data - // MPI_Datatype recvtype, // Type of receive data - // 
MPI_Comm comm // Communicator - // ); - MPI_Allgatherv(intermediate_mesh.local_to_global_elem_mapping.host_pointer(), intermediate_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, - all_elem_gids.data(), elem_counts.data(), elem_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - // Build a map: element GID -> owning rank - std::map elem_gid_to_rank; - for (int r = 0; r < world_size; ++r) { - for (int i = 0; i < elem_counts[r]; ++i) { - size_t gid = all_elem_gids[elem_displs[r] + i]; - elem_gid_to_rank[gid] = r; - } - } - - // Strategy: Find ghost elements by checking neighbors of our boundary elements. - // A boundary element is one that has a neighbor owned by another rank. - // However, since build_connectivity() only includes locally-owned elements, - // we need to use a different approach: find elements on other ranks that share - // nodes with our locally-owned elements. - - // First, collect all nodes that belong to our locally-owned elements - std::set local_elem_nodes; - for (int lid = 0; lid < num_new_elems; ++lid) { - for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - local_elem_nodes.insert(node_gid); - } - } - - // Now collect element-to-node connectivity to send to all ranks - // Format: for each element, list its node GIDs (each entry is a pair: elem_gid, node_gid) - std::vector elem_node_conn; - int local_conn_size = 0; - - for (int lid = 0; lid < num_new_elems; ++lid) { - size_t elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(lid); - for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - elem_node_conn.push_back(elem_gid); - elem_node_conn.push_back(node_gid); - } - local_conn_size += nodes_per_elem * 2; // Each pair is 2 
size_ts - } - - // Exchange element-node connectivity with all ranks using Allgather - // First, gather the sizes from each rank - std::vector conn_sizes(world_size); - MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - // Compute displacements - std::vector conn_displs(world_size); - int total_conn = 0; - for (int r = 0; r < world_size; ++r) { - conn_displs[r] = total_conn; - total_conn += conn_sizes[r]; - } - - // Gather all element-node pairs from all ranks - std::vector all_conn(total_conn); - MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG, - all_conn.data(), conn_sizes.data(), conn_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - // create a set for local_elem_gids - std::set local_elem_gids; - for (int i = 0; i < num_new_elems; ++i) { - local_elem_gids.insert(intermediate_mesh.local_to_global_elem_mapping.host(i)); - } - - // Build a map: node GID -> set of element GIDs that contain it (from other ranks) - std::map> node_to_ext_elem; - for (int r = 0; r < world_size; ++r) { - if (r == rank) continue; // Skip our own data - // Process pairs from rank r: conn_sizes[r] is in units of size_ts, so num_pairs = conn_sizes[r] / 2 - int num_pairs = conn_sizes[r] / 2; - for (int i = 0; i < num_pairs; ++i) { - // Each pair is 2 size_ts, starting at conn_displs[r] - int offset = conn_displs[r] + i * 2; - size_t elem_gid = all_conn[offset]; - size_t node_gid = all_conn[offset + 1]; - - // If this node is in one of our elements, then the element is a potential ghost - if (local_elem_nodes.find(node_gid) != local_elem_nodes.end()) { - // Check if this element is not owned by us - if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { - node_to_ext_elem[node_gid].insert(elem_gid); - } - } - } - } - - // Collect all unique ghost element GIDs - std::set ghost_elem_gids; - for (const auto& pair : node_to_ext_elem) { - for 
(size_t elem_gid : pair.second) { - ghost_elem_gids.insert(elem_gid); - } - } - - // Additional check: elements that are neighbors of our locally-owned elements - // but are owned by other ranks (these might already be in ghost_elem_gids, but check connectivity) - - for (int lid = 0; lid < num_new_elems; ++lid) { - size_t num_neighbors = intermediate_mesh.num_elems_in_elem(lid); - - for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { - size_t neighbor_lid = intermediate_mesh.elems_in_elem(lid, nbr_idx); - - if (neighbor_lid < static_cast(num_new_elems)) { - size_t neighbor_gid = intermediate_mesh.local_to_global_elem_mapping(neighbor_lid); - - // Check if neighbor is owned by this rank - auto it = elem_gid_to_rank.find(neighbor_gid); - if (it != elem_gid_to_rank.end() && it->second != rank) { - // Neighbor is owned by another rank - it's a ghost for us - ghost_elem_gids.insert(neighbor_gid); - } - } - } - } - - // Count unique ghost elements - intermediate_mesh.num_ghost_elems = ghost_elem_gids.size(); - - MPI_Barrier(MPI_COMM_WORLD); - double t_ghost_end = MPI_Wtime(); - - if (rank == 0) { - std::cout << " Finished calculating ghost elements" << std::endl; - std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." 
<< std::endl; - } - - // Print ghost element info if requested - print_info = false; - for(int i = 0; i < world_size; i++) { - MPI_Barrier(MPI_COMM_WORLD); - if(rank == i && print_info) { - std::cout << "[rank " << rank << "] owns " << num_new_elems - << " elements and has " << intermediate_mesh.num_ghost_elems << " ghost elements" << std::endl; - std::cout << "[rank " << rank << "] owned element global IDs: "; - for (int j = 0; j < intermediate_mesh.num_elems; j++) { - std::cout << intermediate_mesh.local_to_global_elem_mapping(j) << " "; - } - - // Print global IDs of ghost elements - std::cout << std::endl << "[rank " << rank << "] ghost element global IDs: "; - for (const auto& gid : ghost_elem_gids) { - std::cout << gid << " "; - } - std::cout << std::endl; - } - - MPI_Barrier(MPI_COMM_WORLD); - } - - - - // Build the connectivity that includes ghost elements - // Create an extended mesh with owned elements first, then ghost elements appended - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Starting to build extended mesh with ghost elements" << std::endl; - - // Step 1: Extract ghost element-node connectivity from all_conn - // Build a map: ghost_elem_gid -> vector of node_gids (ordered as in all_conn) - std::map> ghost_elem_to_nodes; - for (const size_t& ghost_gid : ghost_elem_gids) { - ghost_elem_to_nodes[ghost_gid].reserve(nodes_per_elem); - } - - // Extract nodes for each ghost element from all_conn - // The all_conn array has pairs (elem_gid, node_gid) for each rank's elements - for (int r = 0; r < world_size; ++r) { - if (r == rank) continue; // Skip our own data (we already have owned element connectivity) - int num_pairs = conn_sizes[r] / 2; - - // Process pairs in order - each element's nodes are contiguous - for (int i = 0; i < num_pairs; ++i) { - int offset = conn_displs[r] + i * 2; - size_t elem_gid = all_conn[offset]; - size_t node_gid = all_conn[offset + 1]; - - // If this is one of our ghost elements, record its node (in order) - 
auto it = ghost_elem_to_nodes.find(elem_gid); - if (it != ghost_elem_to_nodes.end()) { - it->second.push_back(node_gid); - } - } - } - - // Verify each ghost element has the correct number of nodes - for (auto& pair : ghost_elem_to_nodes) { - if (pair.second.size() != static_cast(nodes_per_elem)) { - std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first - << " has " << pair.second.size() << " nodes, expected " << nodes_per_elem << std::endl; - } - } - - // Step 2: Build extended node list (owned nodes first, then ghost-only nodes) - // Start with owned nodes - std::map node_gid_to_extended_lid; - int extended_node_lid = 0; - - // Add all owned nodes - for (int i = 0; i < intermediate_mesh.num_nodes; ++i) { - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(i); - node_gid_to_extended_lid[node_gid] = extended_node_lid++; - } - - // Add ghost-only nodes (nodes that belong to ghost elements but not to owned elements) - std::set ghost_only_nodes; - for (const auto& pair : ghost_elem_to_nodes) { - for (size_t node_gid : pair.second) { - // Check if we already have this node - if (node_gid_to_extended_lid.find(node_gid) == node_gid_to_extended_lid.end()) { - ghost_only_nodes.insert(node_gid); - } - } - } - - // Assign extended local IDs to ghost-only nodes - for (size_t node_gid : ghost_only_nodes) { - node_gid_to_extended_lid[node_gid] = extended_node_lid++; - } - - int total_extended_nodes = extended_node_lid; - - // Step 3: Prepare requests for ghost node coordinates from owning ranks (if needed later) - // Build request list: for each ghost node, find an owning rank via any ghost element that contains it - std::map> rank_to_ghost_node_requests; - for (size_t node_gid : ghost_only_nodes) { - // Find which rank owns an element containing this node - // Look through ghost elements - for (const auto& pair : ghost_elem_to_nodes) { - size_t ghost_elem_gid = pair.first; - const std::vector& nodes = pair.second; - bool found = false; - 
for (size_t ngid : nodes) { - if (ngid == node_gid) { - found = true; - break; - } - } - if (found) { - auto owner_it = elem_gid_to_rank.find(ghost_elem_gid); - if (owner_it != elem_gid_to_rank.end()) { - rank_to_ghost_node_requests[owner_it->second].push_back(node_gid); - break; - } - } - } - } - - // Step 4: Build extended element list and node connectivity - // Owned elements: 0 to num_new_elems-1 (already have these) - // Ghost elements: num_new_elems to num_new_elems + num_ghost_elems - 1 - - // Create extended element-node connectivity array - int total_extended_elems = intermediate_mesh.num_elems + intermediate_mesh.num_ghost_elems; - std::vector> extended_nodes_in_elem(total_extended_elems); - - // Copy owned element connectivity (convert to extended node LIDs) - for (int lid = 0; lid < intermediate_mesh.num_elems; ++lid) { - extended_nodes_in_elem[lid].reserve(nodes_per_elem); - for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - int ext_lid = node_gid_to_extended_lid[node_gid]; - extended_nodes_in_elem[lid].push_back(ext_lid); - } - } - - // Add ghost element connectivity (map ghost node GIDs to extended node LIDs) - int ghost_elem_ext_lid = intermediate_mesh.num_elems; - std::vector ghost_elem_gids_ordered(ghost_elem_gids.begin(), ghost_elem_gids.end()); - std::sort(ghost_elem_gids_ordered.begin(), ghost_elem_gids_ordered.end()); - - for (size_t ghost_gid : ghost_elem_gids_ordered) { - auto it = ghost_elem_to_nodes.find(ghost_gid); - if (it == ghost_elem_to_nodes.end()) continue; - - extended_nodes_in_elem[ghost_elem_ext_lid].reserve(nodes_per_elem); - for (size_t node_gid : it->second) { - int ext_lid = node_gid_to_extended_lid[node_gid]; - extended_nodes_in_elem[ghost_elem_ext_lid].push_back(ext_lid); - } - ghost_elem_ext_lid++; - } - - MPI_Barrier(MPI_COMM_WORLD); - // Sequential rank-wise printing of extended 
mesh structure info - for (int r = 0; r < world_size; ++r) { - if (rank == r) { - std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; - std::cout << "[rank " << rank << "] - Owned elements: " << intermediate_mesh.num_elems << std::endl; - std::cout << "[rank " << rank << "] - Ghost elements: " << ghost_elem_gids.size() << std::endl; - std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << std::endl; - std::cout << "[rank " << rank << "] - Owned nodes: " << intermediate_mesh.num_nodes << std::endl; - std::cout << "[rank " << rank << "] - Ghost-only nodes: " << ghost_only_nodes.size() << std::endl; - std::cout << "[rank " << rank << "] - Total extended nodes: " << total_extended_nodes << std::endl; - std::cout << std::flush; - } - MPI_Barrier(MPI_COMM_WORLD); - } - - // The extended_nodes_in_elem vector now contains the connectivity for both owned and ghost elements - // Each element's nodes are stored using extended local node IDs (0-based, contiguous) - - // Build reverse maps: extended_lid -> gid for nodes and elements - std::vector extended_lid_to_node_gid(total_extended_nodes); - for (const auto& pair : node_gid_to_extended_lid) { - extended_lid_to_node_gid[pair.second] = pair.first; - } - - // Build extended element GID list: owned first, then ghost - std::vector extended_lid_to_elem_gid(total_extended_elems); - // Owned elements - for (int i = 0; i < intermediate_mesh.num_elems; ++i) { - extended_lid_to_elem_gid[i] = intermediate_mesh.local_to_global_elem_mapping.host(i); - } - // Ghost elements (in sorted order) - for (size_t idx = 0; idx < ghost_elem_gids_ordered.size(); ++idx) { - extended_lid_to_elem_gid[intermediate_mesh.num_elems + idx] = ghost_elem_gids_ordered[idx]; - } - - final_mesh.initialize_nodes(total_extended_nodes); - final_mesh.initialize_elems(total_extended_elems, 3); - final_mesh.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); - 
final_mesh.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); - for (int i = 0; i < total_extended_nodes; i++) { - final_mesh.local_to_global_node_mapping.host(i) = extended_lid_to_node_gid[i]; - } - for (int i = 0; i < total_extended_elems; i++) { - final_mesh.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; - } - final_mesh.local_to_global_node_mapping.update_device(); - final_mesh.local_to_global_elem_mapping.update_device(); - - final_mesh.num_ghost_elems = ghost_elem_gids.size(); - final_mesh.num_ghost_nodes = ghost_only_nodes.size(); - - // Set owned counts for write_vtk (excludes ghost elements/nodes) - final_mesh.num_owned_elems = intermediate_mesh.num_elems; - final_mesh.num_owned_nodes = intermediate_mesh.num_nodes; - - - // Print num ghost elements and nodes on each rank sequentially - for (int r = 0; r < world_size; ++r) { - if (rank == r) { - std::cout << "*******[rank " << rank << "] - Ghost elements: " << final_mesh.num_ghost_elems << std::endl; - std::cout << "*******[rank " << rank << "] - Ghost-only nodes: " << final_mesh.num_ghost_nodes << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } - - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< coordinate. - // 3. Use this map to fill final_node.coords. - - // 1. Build list of all global node IDs needed on this rank (owned + ghosts) - std::vector all_needed_node_gids(total_extended_nodes); - for (int i = 0; i < total_extended_nodes; ++i) { - all_needed_node_gids[i] = final_mesh.local_to_global_node_mapping.host(i); - } - - // 2. Build owned node GIDs and their coordinates - std::vector owned_gids(intermediate_mesh.num_nodes); - for (int i = 0; i < owned_gids.size(); ++i) - owned_gids[i] = intermediate_mesh.local_to_global_node_mapping.host(i); - - // 3. 
Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) - // so we can distribute the needed coordinate data. - // The easiest is to Allgather everyone's "owned_gids" and coords - - int local_owned_count = static_cast(owned_gids.size()); - std::vector owned_counts(world_size, 0); - if (local_owned_count < 0) local_owned_count = 0; // Clean up possibility of -1 - - // a) Gather counts - owned_counts.resize(world_size, 0); - MPI_Allgather(&local_owned_count, 1, MPI_INT, owned_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - - // b) Displacements and total - std::vector owned_displs(world_size,0); - int total_owned = 0; - for (int r=0; r all_owned_gids(total_owned); - MPI_Allgatherv(owned_gids.data(), local_owned_count, MPI_UNSIGNED_LONG_LONG, - all_owned_gids.data(), owned_counts.data(), owned_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - - // d) Global coords (size: total_owned x 3) - std::vector owned_coords_send(3*local_owned_count, 0.0); - for (int i=0; i all_owned_coords(3 * total_owned, 0.0); - - // Create coordinate-specific counts and displacements (in units of doubles, not nodes) - std::vector coord_counts(world_size); - std::vector coord_displs(world_size); - for (int r=0; r coord[3] - std::unordered_map> gid_to_coord; - for (int i=0; i xyz = { - all_owned_coords[3*i+0], - all_owned_coords[3*i+1], - all_owned_coords[3*i+2] - }; - gid_to_coord[all_owned_gids[i]] = xyz; - } - - // 4. Finally, fill final_node.coords with correct coordinates. 
- for (int i = 0; i < total_extended_nodes; ++i) { - size_t gid = final_mesh.local_to_global_node_mapping.host(i); - auto it = gid_to_coord.find(gid); - if (it != gid_to_coord.end()) { - final_node.coords.host(i,0) = it->second[0]; - final_node.coords.host(i,1) = it->second[1]; - final_node.coords.host(i,2) = it->second[2]; - } else { - // Could happen if there's a bug: fill with zeros for safety - final_node.coords.host(i,0) = 0.0; - final_node.coords.host(i,1) = 0.0; - final_node.coords.host(i,2) = 0.0; - } - } - final_node.coords.update_device(); - - - - - // -------------------------------------------------------------------------------------- - // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. - // Steps: - // 1) Each rank contributes its ghost element GIDs. - // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. - // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. - // -------------------------------------------------------------------------------------- - std::vector>> boundary_elem_targets(intermediate_mesh.num_elems); - - // Prepare local ghost list as vector - std::vector ghost_gids_vec; - ghost_gids_vec.reserve(ghost_elem_gids.size()); - for (const auto &g : ghost_elem_gids) ghost_gids_vec.push_back(g); - - // Exchange counts - std::vector ghost_counts(world_size, 0); - int local_ghost_count = static_cast(ghost_gids_vec.size()); - MPI_Allgather(&local_ghost_count, 1, MPI_INT, ghost_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - - // Displacements and recv buffer - std::vector ghost_displs(world_size, 0); - int total_ghosts = 0; - for (int r = 0; r < world_size; ++r) { - ghost_displs[r] = total_ghosts; - total_ghosts += ghost_counts[r]; - } - std::vector all_ghost_gids(total_ghosts); - - // Gather ghost gids - MPI_Allgatherv(ghost_gids_vec.data(), local_ghost_count, MPI_UNSIGNED_LONG_LONG, - all_ghost_gids.data(), ghost_counts.data(), ghost_displs.data(), - 
MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Finished gathering ghost element GIDs" << std::endl; - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Starting to build the reverse map for communication" << std::endl; - // Build map gid -> ranks that ghost it - std::unordered_map> gid_to_ghosting_ranks; - gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); - for (int r = 0; r < world_size; ++r) { - int cnt = ghost_counts[r]; - int off = ghost_displs[r]; - for (int i = 0; i < cnt; ++i) { - size_t g = all_ghost_gids[off + i]; - gid_to_ghosting_ranks[g].push_back(r); - } - } - - // For each local element, list destinations: ranks that ghost our gid - for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { - size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); - auto it = gid_to_ghosting_ranks.find(local_elem_gid); - if (it == gid_to_ghosting_ranks.end()) continue; - const std::vector &dest_ranks = it->second; - for (int rr : dest_ranks) { - if (rr == rank) continue; - boundary_elem_targets[elem_lid].push_back(std::make_pair(rr, local_elem_gid)); - } - } - - std::cout.flush(); - MPI_Barrier(MPI_COMM_WORLD); - // Optional: print a compact summary of reverse map for verification (limited output) - for(int i = 0; i < world_size; i++) { - if (rank == i && print_info) { - std::cout << std::endl; - for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { - - size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); - if (boundary_elem_targets[elem_lid].empty()) - { - std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: no ghost elements" << std::endl; - } - else - { - std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: "; - int shown = 0; - for (const auto &pr : 
boundary_elem_targets[elem_lid]) { - if (shown >= 12) { std::cout << " ..."; break; } - std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; - shown++; - } - std::cout << std::endl; - } - } - std::cout.flush(); - } - MPI_Barrier(MPI_COMM_WORLD); - } - - print_info = false; - - - MPI_Barrier(MPI_COMM_WORLD); - - - - - - - for(int i = 0; i < world_size; i++) { - if(rank == i && print_info) { - print_rank_mesh_info(intermediate_mesh, i); - } - MPI_Barrier(MPI_COMM_WORLD); - } - MPI_Barrier(MPI_COMM_WORLD); - + partition_mesh(initial_mesh, final_mesh, initial_node, final_node, world_size, rank); // write_vtk(intermediate_mesh, intermediate_node, rank); + MPI_Barrier(MPI_COMM_WORLD); write_vtu(final_mesh, final_node, rank, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); // Stop timer and get execution time @@ -2023,7 +78,5 @@ int main(int argc, char** argv) { MATAR_FINALIZE(); MPI_Finalize(); - - return 0; } \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index 7e6f6c83..10d8838f 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -116,7 +116,6 @@ inline int PointIndexFromIJK(int i, int j, int k, const int* order) ///////////////////////////////////////////////////////////////////////////// void build_3d_box( Mesh_t& mesh, - GaussPoint_t& GaussPoints, node_t& node, double origin[3], double length[3], @@ -252,26 +251,7 @@ void build_3d_box( // ---- Update host data ---- - // material point values - // State.MaterialPoints.den.update_host(); - // State.MaterialPoints.pres.update_host(); - // State.MaterialPoints.stress.update_host(); - // State.MaterialPoints.sspd.update_host(); - // State.MaterialPoints.sie.update_host(); - // State.MaterialPoints.mass.update_host(); - // State.MaterialPoints.conductivity.update_host(); - // State.MaterialPoints.temp_grad.update_host(); - // State.MaterialPoints.eroded.update_host(); - - - // gauss point values - // 
State.GaussPoints.vol.update_host(); - - // nodal values node.coords.update_host(); - // State.node.vel.update_host(); - // State.node.mass.update_host(); - // State.node.temp.update_host(); Kokkos::fence(); From 3ac20b2e6a509a9c4455f628ba13cf76951762c3 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 3 Nov 2025 14:42:23 -0600 Subject: [PATCH 15/52] ENH: Developing communication plan, WIP --- examples/mesh_decomp/communication_plan.h | 572 ++++++++++++++++++++++ examples/mesh_decomp/decomp_utils.h | 266 ++++++++-- examples/mesh_decomp/mesh_decomp.cpp | 3 +- examples/mesh_decomp/mesh_io.h | 5 +- 4 files changed, 797 insertions(+), 49 deletions(-) create mode 100644 examples/mesh_decomp/communication_plan.h diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h new file mode 100644 index 00000000..83d2cb46 --- /dev/null +++ b/examples/mesh_decomp/communication_plan.h @@ -0,0 +1,572 @@ +/** + * @struct CommunicationPlan + * @brief Manages efficient MPI communication for ghost element and node data exchange + * + * Pure data-oriented design with only flat, contiguous arrays for maximum cache efficiency. + * Designed to be embedded in distributed data structures for automatic ghost synchronization. 
+ * + * Usage pattern in distributed structures: + * node.velocity.comm() -> automatically syncs ghost nodes + * elem.density.comm() -> automatically syncs ghost elements + * + * Memory layout philosophy: + * - Only std::vector (int, size_t, double) + * - CSR-style indexing for variable-length per-rank data + * - No std::map, std::set, std::pair, or nested containers + * - Pre-allocated MPI buffers to avoid repeated allocations + * - Separate element and node communication plans + */ + struct CommunicationPlan { + + // ======================================================================== + // CORE DATA STRUCTURES - FLAT ARRAYS ONLY + // ======================================================================== + + + // --- Ghost Send Plan: Owned elements/nodes -> destination ranks --- (Works for both elements and nodes) + int num_send_ranks; // Number of destination ranks + std::vector send_rank_ids; // [size: num_send_ranks] Destination rank IDs + std::vector send_ghost_offsets; // [size: num_send_ranks+1] CSR offsets into send_ghost_lids + std::vector send_ghost_lids; // [size: total_send_ghosts] Local IDs of owned elements/nodes to send + std::vector send_ghost_gids; // [size: total_send_ghosts] Global IDs (for debug/validation) + + // --- Ghost Receive Plan: Ghost elements/nodes <- source ranks --- (Works for both elements and nodes) + int num_recv_ranks; // Number of source ranks + std::vector recv_rank_ids; // [size: num_recv_ranks] Source rank IDs + std::vector recv_ghost_offsets; // [size: num_recv_ranks+1] CSR offsets into recv_ghost_lids + std::vector recv_ghost_lids; // [size: total_recv_ghosts] Local IDs of ghost elements/nodes (>= num_owned) + std::vector recv_ghost_gids; // [size: total_recv_ghosts] Global IDs + + + // --- MPI Communication Buffers (pre-allocated, reusable) --- + std::vector ghost_send_buffer; // Flat buffer for ghost data + std::vector ghost_recv_buffer; // Flat buffer for ghost data + + std::vector send_requests; // Request handles 
for sends + std::vector recv_requests; // Request handles for receives + std::vector mpi_statuses; // Status array for MPI_Waitall + + // --- Persistent communication (optional optimization) --- + std::vector persistent_send_requests; + std::vector persistent_recv_requests; + bool has_persistent_comm; + + + // --- Distributed Graph Topology for Neighborhood Collectives --- + MPI_Comm graph_comm; // Graph communicator encoding sparse communication pattern + bool has_graph_comm; // Whether graph communicator is initialized + + // Counts and displacements for MPI_Neighbor_alltoallv + std::vector send_counts; // [num_send_ranks] Number of items to send per neighbor + std::vector send_displs; // [num_send_ranks] Displacements in send buffer + std::vector recv_counts; // [num_recv_ranks] Number of items to recv per neighbor + std::vector recv_displs; // [num_recv_ranks] Displacements in recv buffer + + // --- Persistent Neighborhood Collectives (MPI-4.0+) --- + MPI_Request persistent_neighbor_request; // Persistent request for neighborhood collective + bool has_persistent_neighbor; // Whether persistent neighborhood is initialized + int persistent_num_fields; // Fields per item for persistent request + + + // ======================================================================== + // CONSTRUCTOR / INITIALIZATION + // ======================================================================== + + CommunicationPlan() + : num_send_ranks(0), num_recv_ranks(0), + has_persistent_comm(false), + has_graph_comm(false), + has_persistent_neighbor(false), + graph_comm(MPI_COMM_NULL), + persistent_neighbor_request(MPI_REQUEST_NULL), + persistent_num_fields(0) {} + + + // Destructor to free MPI resources + ~CommunicationPlan() { + // Free persistent neighborhood collective + if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { + MPI_Request_free(&persistent_neighbor_request); + } + + // Free graph communicator + if (has_graph_comm && graph_comm != 
MPI_COMM_NULL) { + MPI_Comm_free(&graph_comm); + } + } + + + /** + * @brief Build communication plan from mesh with flat array inputs + * @param mesh Reference to partitioned mesh (with ghost elements/nodes) + * @param world_size Number of MPI ranks + * @param my_rank Current MPI rank ID + * @param boundary_ghost_dest_ranks Flat array of destination ranks for boundary elements [size: sum of neighbors] + * @param boundary_ghsot_dest_offsets CSR offsets: boundary_ghost_dest_offsets[elem_lid] = start index in boundary_ghost_dest_ranks + * @param boundary_ghost_dest_gids Flat array of global ghost IDs to send [size: sum of neighbors] + * @param all_ghost_gids All ghost global IDs across all ranks + * @param all_ghost_owner_ranks Owner rank for each ghost GID + * + * This build() function takes only flat arrays as input (no std::map, std::set, std::pair). + * The caller must pre-process the mesh data into flat CSR-style arrays. + * + * Implementation: + * 1. Group sends/receives by rank using flat arrays and CSR indexing + * 2. Pre-allocate all MPI buffers + * 3. 
Store everything in contiguous memory + */ + void build( + const Mesh_t& mesh, + int world_size, + int my_rank, + const int* boundary_ghost_dest_ranks, // Flat array of dest ranks + const int* boundary_ghost_dest_offsets, // CSR offsets [size: num_owned_ghosts+1] + const size_t* boundary_ghost_dest_gids, // Flat array of ghost GIDs + const size_t* all_ghost_gids, // All ghost GIDs + const int* all_ghost_owner_ranks, // Owner ranks indexed by GID + ); + + + // ======================================================================== + // COMMUNICATION INTERFACE - FOR DISTRIBUTED DATA STRUCTURES + // ======================================================================== + + /** + * @brief Pack and exchange data with automatic ghost synchronization + * @param data_ptr Pointer to data array [size: num_total_items * stride] + * @param num_fields Number of fields per item (stride) + * @param item_type 0=elements, 1=nodes + * @param comm MPI communicator + * @param blocking If true, waits for completion before returning + * + * This is the main interface for distributed structures like: + * node.velocity.comm() internally calls: + * comm_plan.communicate(node.velocity.data(), 3, 1, MPI_COMM_WORLD, true) + */ + void communicate(double* data_ptr, int num_fields, int item_type, + MPI_Comm comm = MPI_COMM_WORLD, bool blocking = true); + + + /** + * @brief Non-blocking version: initiate communication + * Returns immediately; user must call wait_communication() + */ + void communicate_begin(double* data_ptr, int num_fields, int item_type, + MPI_Comm comm = MPI_COMM_WORLD); + + + /** + * @brief Wait for non-blocking communication to complete + */ + void wait_communication(double* data_ptr, int num_fields, int item_type); + + + // ======================================================================== + // LOW-LEVEL PACK/UNPACK (for manual control) + // ======================================================================== + + /** + * @brief Pack element data from contiguous 
array into send buffer + * @param data_ptr Pointer to element data [size: num_total_elems * num_fields] + * @param num_fields Stride (fields per element) + * + * Packs data in layout: [elem0_field0, elem0_field1, ..., elem1_field0, ...] + */ + void pack_ghosts(const double* data_ptr, int num_fields, int field_dimension); + + + /** + * @brief Unpack received element data into ghost elements + */ + void unpack_ghosts(double* data_ptr, int num_fields, int field_dimension); + + + + // ======================================================================== + // MPI EXCHANGE PRIMITIVES + // ======================================================================== + + /** + * @brief Execute MPI_Isend/Irecv for elements + */ + void exchange_ghosts_begin(int num_fields, int field_dimension, MPI_Comm comm = MPI_COMM_WORLD); + + + /** + * @brief Wait for element exchange to complete + */ + void exchange_ghosts_wait(); + + + + // ======================================================================== + // PERSISTENT COMMUNICATION (OPTIMIZATION) + // ======================================================================== + + /** + * @brief Setup persistent MPI communication handles (one-time setup) + * Call once after build(), then use start_persistent/wait_persistent + */ + void init_persistent(int elem_fields, int node_fields, MPI_Comm comm = MPI_COMM_WORLD); + + + /** + * @brief Start persistent send/recv (must call pack_* first) + */ + void start_persistent(); + + + /** + * @brief Wait for persistent communication (then call unpack_*) + */ + void wait_persistent(); + + + /** + * @brief Free persistent communication handles + */ + void free_persistent(); + + + // ======================================================================== + // NEIGHBORHOOD COLLECTIVES (MPI-3.0+) + // ======================================================================== + + /** + * @brief Create distributed graph communicator from communication pattern + * + * Call this ONCE after populating 
send_rank_ids and recv_rank_ids. + * The graph communicator encodes the sparse communication topology and is + * reused for all subsequent neighborhood collective calls. + * + * @param base_comm Base communicator (usually MPI_COMM_WORLD) + * + * Example from your output: + * rank 0 sends to: {2, 3, 4, 10, 11} + * rank 0 receives from: {computed from ghost ownership} + * + * This creates a directed graph where edges represent communication channels. + * MPI can optimize routing and minimize network contention. + * + * Requirements: MPI-3.0+ (2012) + */ + void create_graph_communicator(MPI_Comm base_comm = MPI_COMM_WORLD); + + + /** + * @brief Exchange ghost data using MPI_Neighbor_alltoallv + * + * Uses the pre-created graph communicator for efficient sparse communication. + * This is cleaner than manual Isend/Irecv loops and allows MPI to optimize. + * + * @param data_ptr Pointer to data array [size: num_total_items * num_fields] + * @param num_fields Number of fields per item (e.g., 3 for velocity) + * + * Workflow: + * 1. Pack owned items into send buffer + * 2. Call MPI_Neighbor_alltoallv (blocking but fast with graph_comm) + * 3. Unpack ghost items from receive buffer + * + * The graph_comm is reused each call - only pack/unpack overhead per timestep. + * + * Requirements: Must call create_graph_communicator() once before using this. + */ + void exchange_ghosts_neighborhood(double* data_ptr, int num_fields); + + + /** + * @brief Initialize persistent neighborhood collective (MPI-4.0+) + * + * Creates a persistent MPI request that pre-allocates all internal buffers + * and communication paths. Provides maximum performance for repeated exchanges + * with the same num_fields. 
+ * + * @param num_fields Number of fields per item (must be same for all timesteps) + * + * Call once during setup: + * comm_plan.create_graph_communicator(MPI_COMM_WORLD); + * comm_plan.init_persistent_neighborhood(3); // For 3D velocity + * + * Then use exchange_ghosts_persistent() each timestep. + * + * Requirements: MPI-4.0+ (2021). Check with: mpirun --version + */ + void init_persistent_neighborhood(int num_fields); + + + /** + * @brief Exchange ghosts using persistent neighborhood collective (FASTEST) + * + * Must call init_persistent_neighborhood() once before using this. + * This is the fastest ghost exchange method for fixed communication patterns. + * + * @param data_ptr Pointer to data array [size: num_total_items * num_fields] + * + * Workflow: + * 1. Pack data into same send buffer used during init + * 2. MPI_Start() - extremely fast, no setup overhead + * 3. MPI_Wait() - wait for completion + * 4. Unpack from receive buffer + * + * Typical speedup vs standard neighborhood: 1.2-1.5x + * + * Note: Falls back to exchange_ghosts_neighborhood() if MPI-4 unavailable. + */ + void exchange_ghosts_persistent(double* data_ptr); + + + /** + * @brief Free persistent neighborhood collective resources + * + * Call at end of simulation to release MPI resources. + * Automatically called by destructor if not explicitly freed. 
+ */ + void free_persistent_neighborhood(); + + + // ======================================================================== + // UTILITIES + // ======================================================================== + + void print_summary(int rank) const; + bool validate(MPI_Comm comm = MPI_COMM_WORLD) const; + size_t send_volume(int elem_fields, int node_fields) const; + size_t recv_volume(int elem_fields, int node_fields) const; + bool needs_communication() const; + int num_neighbor_ranks() const; + + + // ======================================================================== + // INLINE IMPLEMENTATIONS - NEIGHBORHOOD COLLECTIVES + // ======================================================================== + + /** + * @brief Create distributed graph communicator from communication pattern + */ + inline void create_graph_communicator(MPI_Comm base_comm) { + + if (has_graph_comm) { + std::cerr << "Warning: Graph communicator already created, skipping." << std::endl; + return; + } + + int indegree = num_recv_ranks; // Number of ranks we receive FROM + int outdegree = num_send_ranks; // Number of ranks we send TO + + // Create the distributed graph communicator + // MPI_Dist_graph_create_adjacent signature: + // (comm_old, indegree, sources[], sourceweights, outdegree, dests[], destweights, + // info, reorder, comm_dist_graph) + int reorder = 0; // Don't reorder ranks (keep same as base_comm) + + MPI_Dist_graph_create_adjacent( + base_comm, // Base communicator + indegree, // We receive from num_recv_ranks neighbors + recv_rank_ids.data(), // Source ranks (we receive from these) + MPI_UNWEIGHTED, // No edge weights for sources + outdegree, // We send to num_send_ranks neighbors + send_rank_ids.data(), // Destination ranks (we send to these) + MPI_UNWEIGHTED, // No edge weights for destinations + MPI_INFO_NULL, // No special hints + reorder, // Don't reorder ranks + &graph_comm // Output: new graph communicator + ); + + has_graph_comm = true; + + // Pre-allocate 
counts and displacements arrays + send_counts.resize(num_send_ranks); + send_displs.resize(num_send_ranks); + recv_counts.resize(num_recv_ranks); + recv_displs.resize(num_recv_ranks); + } + + + /** + * @brief Exchange ghost data using MPI_Neighbor_alltoallv + */ + inline void exchange_ghosts_neighborhood(double* data_ptr, int num_fields) { + + if (!has_graph_comm) { + std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; + return; + } + + // 1. Pack send buffer from owned items + int total_send = send_ghost_lids.size(); + ghost_send_buffer.resize(total_send * num_fields); + + for (size_t i = 0; i < send_ghost_lids.size(); i++) { + int local_id = send_ghost_lids[i]; + for (int f = 0; f < num_fields; f++) { + ghost_send_buffer[i * num_fields + f] = data_ptr[local_id * num_fields + f]; + } + } + + // 2. Update counts and displacements for this num_fields + for (int i = 0; i < num_send_ranks; i++) { + int start_idx = send_ghost_offsets[i]; + int end_idx = send_ghost_offsets[i + 1]; + send_counts[i] = (end_idx - start_idx) * num_fields; + send_displs[i] = start_idx * num_fields; + } + + int total_recv = recv_ghost_lids.size(); + ghost_recv_buffer.resize(total_recv * num_fields); + + for (int i = 0; i < num_recv_ranks; i++) { + int start_idx = recv_ghost_offsets[i]; + int end_idx = recv_ghost_offsets[i + 1]; + recv_counts[i] = (end_idx - start_idx) * num_fields; + recv_displs[i] = start_idx * num_fields; + } + + // 3. 
Execute neighborhood collective (BLOCKING but fast with graph_comm) + // MPI_Neighbor_alltoallv signature: + // (sendbuf, sendcounts[], sdispls[], sendtype, + // recvbuf, recvcounts[], rdispls[], recvtype, comm) + MPI_Neighbor_alltoallv( + ghost_send_buffer.data(), // Send buffer + send_counts.data(), // Send counts per neighbor + send_displs.data(), // Send displacements + MPI_DOUBLE, // Send type + ghost_recv_buffer.data(), // Receive buffer + recv_counts.data(), // Receive counts per neighbor + recv_displs.data(), // Receive displacements + MPI_DOUBLE, // Receive type + graph_comm // Graph communicator (NOT MPI_COMM_WORLD!) + ); + + // 4. Unpack receive buffer into ghost items + for (size_t i = 0; i < recv_ghost_lids.size(); i++) { + int ghost_local_id = recv_ghost_lids[i]; + for (int f = 0; f < num_fields; f++) { + data_ptr[ghost_local_id * num_fields + f] = ghost_recv_buffer[i * num_fields + f]; + } + } + } + + + /** + * @brief Initialize persistent neighborhood collective (MPI-4.0+) + */ + inline void init_persistent_neighborhood(int num_fields) { + + if (!has_graph_comm) { + std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; + return; + } + + if (has_persistent_neighbor) { + std::cerr << "Warning: Persistent neighborhood already initialized, freeing and re-creating." 
<< std::endl; + free_persistent_neighborhood(); + } + + persistent_num_fields = num_fields; + + // Allocate buffers + int total_send = send_ghost_lids.size(); + int total_recv = recv_ghost_lids.size(); + ghost_send_buffer.resize(total_send * num_fields); + ghost_recv_buffer.resize(total_recv * num_fields); + + // Setup counts and displacements for persistent request + for (int i = 0; i < num_send_ranks; i++) { + int start_idx = send_ghost_offsets[i]; + int end_idx = send_ghost_offsets[i + 1]; + send_counts[i] = (end_idx - start_idx) * num_fields; + send_displs[i] = start_idx * num_fields; + } + + for (int i = 0; i < num_recv_ranks; i++) { + int start_idx = recv_ghost_offsets[i]; + int end_idx = recv_ghost_offsets[i + 1]; + recv_counts[i] = (end_idx - start_idx) * num_fields; + recv_displs[i] = start_idx * num_fields; + } + +#if MPI_VERSION >= 4 + // MPI-4.0+ persistent neighborhood collective + // MPI_Neighbor_alltoallv_init signature (similar to MPI_Neighbor_alltoallv but creates request): + // (sendbuf, sendcounts[], sdispls[], sendtype, + // recvbuf, recvcounts[], rdispls[], recvtype, comm, info, request) + MPI_Neighbor_alltoallv_init( + ghost_send_buffer.data(), send_counts.data(), send_displs.data(), MPI_DOUBLE, + ghost_recv_buffer.data(), recv_counts.data(), recv_displs.data(), MPI_DOUBLE, + graph_comm, + MPI_INFO_NULL, + &persistent_neighbor_request + ); + has_persistent_neighbor = true; +#else + int rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + if (rank == 0) { + std::cerr << "Warning: MPI-4.0 required for persistent neighborhood collectives" << std::endl; + std::cerr << " Detected MPI version: " << MPI_VERSION << "." 
<< MPI_SUBVERSION << std::endl; + std::cerr << " Will fall back to standard neighborhood collective" << std::endl; + } + has_persistent_neighbor = false; +#endif + } + + + /** + * @brief Exchange ghosts using persistent neighborhood collective (FASTEST) + */ + inline void exchange_ghosts_persistent(double* data_ptr) { + +#if MPI_VERSION >= 4 + if (!has_persistent_neighbor) { + std::cerr << "Error: Must call init_persistent_neighborhood() first!" << std::endl; + std::cerr << " Falling back to standard neighborhood collective..." << std::endl; + exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); + return; + } + + // 1. Pack send buffer (same memory location as during init) + for (size_t i = 0; i < send_ghost_lids.size(); i++) { + int local_id = send_ghost_lids[i]; + for (int f = 0; f < persistent_num_fields; f++) { + ghost_send_buffer[i * persistent_num_fields + f] = + data_ptr[local_id * persistent_num_fields + f]; + } + } + + // 2. Start persistent request (VERY fast - no setup overhead) + MPI_Start(&persistent_neighbor_request); + + // 3. Wait for completion + MPI_Wait(&persistent_neighbor_request, MPI_STATUS_IGNORE); + + // 4. 
Unpack receive buffer + for (size_t i = 0; i < recv_ghost_lids.size(); i++) { + int ghost_id = recv_ghost_lids[i]; + for (int f = 0; f < persistent_num_fields; f++) { + data_ptr[ghost_id * persistent_num_fields + f] = + ghost_recv_buffer[i * persistent_num_fields + f]; + } + } +#else + // Fallback to standard method if MPI-4 not available + exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); +#endif + } + + + /** + * @brief Free persistent neighborhood collective resources + */ + inline void free_persistent_neighborhood() { +#if MPI_VERSION >= 4 + if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { + MPI_Request_free(&persistent_neighbor_request); + persistent_neighbor_request = MPI_REQUEST_NULL; + has_persistent_neighbor = false; + } +#endif + } + +}; + + diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 0357b6a6..26dd83c6 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -19,6 +19,11 @@ #include "scotch.h" #include "ptscotch.h" + + + + + void partition_mesh( Mesh_t& initial_mesh, Mesh_t& final_mesh, @@ -44,7 +49,7 @@ void partition_mesh( int num_nodes_per_elem = 0; - std::vector elements_on_rank; + std::vector nodes_on_rank; @@ -64,14 +69,10 @@ void partition_mesh( std::vector> node_pos_on_rank(world_size); - - if (rank == 0) { num_nodes_per_elem = initial_mesh.num_nodes_in_elem; - // Compute elements to send to each rank; handle remainders for non-even distribution - // Compute elements to send to each rank; handle remainders for non-even distribution std::fill(elems_per_rank.begin(), elems_per_rank.end(), initial_mesh.num_elems / world_size); int remainder = initial_mesh.num_elems % world_size; @@ -80,6 +81,8 @@ void partition_mesh( } } + // Broadcasts the value of num_nodes_per_elem from the root rank (0) to all other ranks in MPI_COMM_WORLD. + // After this call, all ranks will have the same value for num_nodes_per_elem. 
MPI_Bcast(&num_nodes_per_elem, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); @@ -98,78 +101,74 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); - // Resize the elements_on_rank vector to hold the received data - elements_on_rank.resize(num_elements_on_rank); - + // Vector of element to send to each rank using a naive partitioning (0-m, m-n, n-o, etc.) + std::vector elements_on_rank(num_elements_on_rank); MPI_Barrier(MPI_COMM_WORLD); double t_scatter_end = MPI_Wtime(); - if(rank == 0) { - std::cout<<" Finished scattering the number of elements to each rank"< all_elements; - std::vector sendcounts(world_size); - std::vector displs(world_size); + std::vector all_elements; // array of all elements to be sent to each rank + std::vector sendcounts(world_size); // array of the number of elements to send to each rank + std::vector displs(world_size); // array of the displacement for each rank in the flattened array - int displacement = 0; + int displacement = 0; // displacement is the starting index of the elements for the current rank in the flattened array for (int i = 0; i < world_size; i++) { - sendcounts[i] = elems_per_rank[i]; - displs[i] = displacement; + sendcounts[i] = elems_per_rank[i]; // number of elements to send to each rank + displs[i] = displacement; // displacement for each rank in the flattened array // Copy elements for rank i to the flattened array for (int j = 0; j < elems_per_rank[i]; j++) { - all_elements.push_back(elements_to_send[i][j]); + all_elements.push_back(elements_to_send[i][j]); // add the elements to the flattened array } - displacement += elems_per_rank[i]; + displacement += elems_per_rank[i]; // increment the displacement by the number of elements to send to the next rank } // Send the elements to each rank + // all_elements.data(): Pointer to the flattened array of all elements to be sent to each rank + // sendcounts.data(): Array with the number of elements to send to each rank + // displs.data(): Array with the 
displacement for each rank in the flattened array + // MPI_INT: Data type of the elements (integer) + // elements_on_rank.data(): Pointer to the buffer where each rank will receive its elements + // num_elements_on_rank: Number of elements that the receiving rank expects to receive + // MPI_INT: Data type of the receive buffer (integer) + // 0: The root rank (rank 0) that is performing the scatter + // MPI_COMM_WORLD: The communicator MPI_Scatterv(all_elements.data(), sendcounts.data(), displs.data(), MPI_INT, elements_on_rank.data(), num_elements_on_rank, MPI_INT, 0, MPI_COMM_WORLD); } else { + // If the rank is not the root rank, it will receive nullptr for the sendbuf, sendcounts, and displs arrays MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, elements_on_rank.data(), num_elements_on_rank, MPI_INT, 0, MPI_COMM_WORLD); } + // Wait for all ranks to complete the scatter operation MPI_Barrier(MPI_COMM_WORLD); + + // Timer: End measuring time for scattering element global ids double t_scatter_gids_end = MPI_Wtime(); - if(rank == 0) { + if(rank == 0 && print_info) { std::cout<<" Finished scattering the actual element global ids to each rank"< ranks that ghost it std::unordered_map> gid_to_ghosting_ranks; gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); @@ -1906,6 +1905,10 @@ void partition_mesh( std::cout.flush(); MPI_Barrier(MPI_COMM_WORLD); + + + + // Optional: print a compact summary of reverse map for verification (limited output) for(int i = 0; i < world_size; i++) { if (rank == i && print_info) { @@ -1929,7 +1932,76 @@ void partition_mesh( std::cout << std::endl; } } - std::cout.flush(); + } + MPI_Barrier(MPI_COMM_WORLD); + } + + + // Add a vector to store boundary element local_ids (those who have ghost destinations across ranks) + std::vector boundary_elem_local_ids; + std::vector> boundary_to_ghost_ranks; // ragged array dimensions (num_boundary_elems, num_ghost_ranks) + + std::set ghost_comm_ranks; // set of ranks that this rank communicates with 
+ + + for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { + + int local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); + if (boundary_elem_targets[elem_lid].empty()) + { + continue; + } + else + { + // Fill in vector of boundary local_ids + boundary_elem_local_ids.push_back(elem_lid); + std::vector ghost_ranks_for_this_boundary_elem; + for (const auto &pr : boundary_elem_targets[elem_lid]) { + ghost_ranks_for_this_boundary_elem.push_back(pr.first); + ghost_comm_ranks.insert(pr.first); + } + boundary_to_ghost_ranks.push_back(ghost_ranks_for_this_boundary_elem); + } + } + + int num_ghost_comm_ranks = ghost_comm_ranks.size(); + std::vector ghost_comm_ranks_vec(num_ghost_comm_ranks); + int i = 0; + for (const auto &r : ghost_comm_ranks) { + ghost_comm_ranks_vec[i] = r; + i++; + } + + + MPI_Barrier(MPI_COMM_WORLD); + + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << std::endl; + std::cout << "[rank " << rank << "] communicates to ranks: "; + for (int i = 0; i < num_ghost_comm_ranks; ++i) { + std::cout << ghost_comm_ranks_vec[i] << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + // Print out the boundary element local ids on each rank sequentially + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r && print_info) { + std::cout << std::endl; + std::cout << "[rank " << rank << "] Boundary element global ids: " < [ranks that ghost it]. + // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. 
+ // -------------------------------------------------------------------------------------- + + std::vector>> boundary_node_targets(intermediate_mesh.num_nodes); + + // Prepare local ghost node list as vector + std::vector ghost_node_gids_vec; + ghost_node_gids_vec.reserve(ghost_only_nodes.size()); + for (const auto &g : ghost_only_nodes) ghost_node_gids_vec.push_back(g); + + // Exchange counts + std::vector ghost_node_counts(world_size, 0); + int local_ghost_node_count = static_cast(ghost_node_gids_vec.size()); + MPI_Allgather(&local_ghost_node_count, 1, MPI_INT, ghost_node_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // Displacements and recv buffer + std::vector ghost_node_displs(world_size, 0); + int total_ghost_nodes = 0; + for (int r = 0; r < world_size; ++r) { + ghost_node_displs[r] = total_ghost_nodes; + total_ghost_nodes += ghost_node_counts[r]; + } + std::vector all_ghost_node_gids(total_ghost_nodes); + + // Gather ghost node gids + MPI_Allgatherv(ghost_node_gids_vec.data(), local_ghost_node_count, MPI_UNSIGNED_LONG_LONG, + all_ghost_node_gids.data(), ghost_node_counts.data(), ghost_node_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Finished gathering ghost node GIDs" << std::endl; + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build the reverse map for node communication" << std::endl; + + // Build map node_gid -> ranks that ghost it + std::unordered_map> node_gid_to_ghosting_ranks; + node_gid_to_ghosting_ranks.reserve(static_cast(total_ghost_nodes)); + for (int r = 0; r < world_size; ++r) { + int cnt = ghost_node_counts[r]; + int off = ghost_node_displs[r]; + for (int i = 0; i < cnt; ++i) { + size_t g = all_ghost_node_gids[off + i]; + node_gid_to_ghosting_ranks[g].push_back(r); + } + } + + // For each local node, list destinations: ranks that ghost our node gid + for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { + 
size_t local_node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + auto it = node_gid_to_ghosting_ranks.find(local_node_gid); + if (it == node_gid_to_ghosting_ranks.end()) continue; + const std::vector &dest_ranks = it->second; + for (int rr : dest_ranks) { + if (rr == rank) continue; + boundary_node_targets[node_lid].push_back(std::make_pair(rr, local_node_gid)); + } + } + + std::cout.flush(); + MPI_Barrier(MPI_COMM_WORLD); + print_info = false; + + // Optional: print a compact summary of node reverse map for verification (limited output) + for(int i = 0; i < world_size; i++) { + if (rank == i && print_info) { + std::cout << std::endl; + for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { + + size_t local_node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + if (boundary_node_targets[node_lid].empty()) + { + std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: no ghost nodes" << std::endl; + } + else + { + std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: "; + int shown = 0; + for (const auto &pr : boundary_node_targets[node_lid]) { + if (shown >= 12) { std::cout << " ..."; break; } + std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; + shown++; + } + std::cout << std::endl; + } + } + std::cout.flush(); + } + MPI_Barrier(MPI_COMM_WORLD); + } + + print_info = false; + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Finished building node communication reverse map" << std::endl; +} + diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index bc3e8371..608c3867 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = 
{100, 100, 100}; + int num_elems_dim[3] = {50, 50, 50}; // Initial mesh built on rank zero Mesh_t initial_mesh; @@ -65,6 +65,7 @@ int main(int argc, char** argv) { // write_vtk(intermediate_mesh, intermediate_node, rank); MPI_Barrier(MPI_COMM_WORLD); write_vtu(final_mesh, final_node, rank, MPI_COMM_WORLD); + // write_vtk(final_mesh, final_node, rank); MPI_Barrier(MPI_COMM_WORLD); // Stop timer and get execution time diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index 10d8838f..f0801777 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -515,7 +515,7 @@ void write_vtu(Mesh_t& mesh, const int num_cell_vec_vars = 0; const int num_cell_tensor_vars = 0; - const int num_point_scalar_vars = 2; + const int num_point_scalar_vars = 3; const int num_point_vec_vars = 1; // Scalar values associated with a cell @@ -524,7 +524,7 @@ void write_vtu(Mesh_t& mesh, }; const char point_scalar_var_names[num_point_scalar_vars][15] = { - "rank_id", "elems_in_node" + "rank_id", "elems_in_node", "global_node_id" }; const char point_vec_var_names[num_point_vec_vars][15] = { @@ -557,6 +557,7 @@ void write_vtu(Mesh_t& mesh, point_scalar_fields(node_gid, 0) = rank; point_scalar_fields(node_gid, 1) = (double)mesh.num_corners_in_node(node_gid); + point_scalar_fields(node_gid, 2) = (double)mesh.local_to_global_node_mapping.host(node_gid); } // File management From 285460bd3bc37c186a11f76c8ff00f659d3be1c8 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 3 Nov 2025 15:27:51 -0600 Subject: [PATCH 16/52] ENH: Creating MPI types, WIP --- examples/mesh_decomp/communication_plan.h | 302 +++--------------- examples/mesh_decomp/decomp_utils.h | 18 ++ examples/mesh_decomp/mpi_type.h | 360 ++++++++++++++++++++++ examples/mesh_decomp/state.h | 2 + 4 files changed, 419 insertions(+), 263 deletions(-) create mode 100644 examples/mesh_decomp/mpi_type.h diff --git a/examples/mesh_decomp/communication_plan.h 
b/examples/mesh_decomp/communication_plan.h index 83d2cb46..32833e1a 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -9,12 +9,6 @@ * node.velocity.comm() -> automatically syncs ghost nodes * elem.density.comm() -> automatically syncs ghost elements * - * Memory layout philosophy: - * - Only std::vector (int, size_t, double) - * - CSR-style indexing for variable-length per-rank data - * - No std::map, std::set, std::pair, or nested containers - * - Pre-allocated MPI buffers to avoid repeated allocations - * - Separate element and node communication plans */ struct CommunicationPlan { @@ -25,30 +19,26 @@ // --- Ghost Send Plan: Owned elements/nodes -> destination ranks --- (Works for both elements and nodes) int num_send_ranks; // Number of destination ranks - std::vector send_rank_ids; // [size: num_send_ranks] Destination rank IDs - std::vector send_ghost_offsets; // [size: num_send_ranks+1] CSR offsets into send_ghost_lids - std::vector send_ghost_lids; // [size: total_send_ghosts] Local IDs of owned elements/nodes to send + DCArrayKokkos send_rank_ids; // [size: num_send_ranks] Destination rank IDs + DCArrayKokkos send_ghost_offsets; // [size: num_send_ranks+1] CSR offsets into send_ghost_lids + DCArrayKokkos send_ghost_lids; // [size: total_send_ghosts] Local IDs of owned elements/nodes to send std::vector send_ghost_gids; // [size: total_send_ghosts] Global IDs (for debug/validation) // --- Ghost Receive Plan: Ghost elements/nodes <- source ranks --- (Works for both elements and nodes) int num_recv_ranks; // Number of source ranks - std::vector recv_rank_ids; // [size: num_recv_ranks] Source rank IDs - std::vector recv_ghost_offsets; // [size: num_recv_ranks+1] CSR offsets into recv_ghost_lids - std::vector recv_ghost_lids; // [size: total_recv_ghosts] Local IDs of ghost elements/nodes (>= num_owned) + DCArrayKokkos recv_rank_ids; // [size: num_recv_ranks] Source rank IDs + DCArrayKokkos 
recv_ghost_offsets; // [size: num_recv_ranks+1] CSR offsets into recv_ghost_lids + DCArrayKokkos recv_ghost_lids; // [size: total_recv_ghosts] Local IDs of ghost elements/nodes (>= num_owned) std::vector recv_ghost_gids; // [size: total_recv_ghosts] Global IDs - // --- MPI Communication Buffers (pre-allocated, reusable) --- - std::vector ghost_send_buffer; // Flat buffer for ghost data - std::vector ghost_recv_buffer; // Flat buffer for ghost data - - std::vector send_requests; // Request handles for sends - std::vector recv_requests; // Request handles for receives - std::vector mpi_statuses; // Status array for MPI_Waitall + DCArrayKokkos send_requests; // Request handles for sends + DCArrayKokkos recv_requests; // Request handles for receives + DCArrayKokkos mpi_statuses; // Status array for MPI_Waitall // --- Persistent communication (optional optimization) --- - std::vector persistent_send_requests; - std::vector persistent_recv_requests; + DCArrayKokkos persistent_send_requests; + DCArrayKokkos persistent_recv_requests; bool has_persistent_comm; @@ -57,10 +47,10 @@ bool has_graph_comm; // Whether graph communicator is initialized // Counts and displacements for MPI_Neighbor_alltoallv - std::vector send_counts; // [num_send_ranks] Number of items to send per neighbor - std::vector send_displs; // [num_send_ranks] Displacements in send buffer - std::vector recv_counts; // [num_recv_ranks] Number of items to recv per neighbor - std::vector recv_displs; // [num_recv_ranks] Displacements in recv buffer + DCArrayKokkos send_counts; // [num_send_ranks] Number of items to send per neighbor + DCArrayKokkos send_displs; // [num_send_ranks] Displacements in send buffer + DCArrayKokkos recv_counts; // [num_recv_ranks] Number of items to recv per neighbor + DCArrayKokkos recv_displs; // [num_recv_ranks] Displacements in recv buffer // --- Persistent Neighborhood Collectives (MPI-4.0+) --- MPI_Request persistent_neighbor_request; // Persistent request for neighborhood 
collective @@ -96,245 +86,31 @@ } - /** - * @brief Build communication plan from mesh with flat array inputs - * @param mesh Reference to partitioned mesh (with ghost elements/nodes) - * @param world_size Number of MPI ranks - * @param my_rank Current MPI rank ID - * @param boundary_ghost_dest_ranks Flat array of destination ranks for boundary elements [size: sum of neighbors] - * @param boundary_ghsot_dest_offsets CSR offsets: boundary_ghost_dest_offsets[elem_lid] = start index in boundary_ghost_dest_ranks - * @param boundary_ghost_dest_gids Flat array of global ghost IDs to send [size: sum of neighbors] - * @param all_ghost_gids All ghost global IDs across all ranks - * @param all_ghost_owner_ranks Owner rank for each ghost GID - * - * This build() function takes only flat arrays as input (no std::map, std::set, std::pair). - * The caller must pre-process the mesh data into flat CSR-style arrays. - * - * Implementation: - * 1. Group sends/receives by rank using flat arrays and CSR indexing - * 2. Pre-allocate all MPI buffers - * 3. 
Store everything in contiguous memory - */ - void build( - const Mesh_t& mesh, - int world_size, - int my_rank, - const int* boundary_ghost_dest_ranks, // Flat array of dest ranks - const int* boundary_ghost_dest_offsets, // CSR offsets [size: num_owned_ghosts+1] - const size_t* boundary_ghost_dest_gids, // Flat array of ghost GIDs - const size_t* all_ghost_gids, // All ghost GIDs - const int* all_ghost_owner_ranks, // Owner ranks indexed by GID - ); - - - // ======================================================================== - // COMMUNICATION INTERFACE - FOR DISTRIBUTED DATA STRUCTURES - // ======================================================================== - - /** - * @brief Pack and exchange data with automatic ghost synchronization - * @param data_ptr Pointer to data array [size: num_total_items * stride] - * @param num_fields Number of fields per item (stride) - * @param item_type 0=elements, 1=nodes - * @param comm MPI communicator - * @param blocking If true, waits for completion before returning - * - * This is the main interface for distributed structures like: - * node.velocity.comm() internally calls: - * comm_plan.communicate(node.velocity.data(), 3, 1, MPI_COMM_WORLD, true) - */ - void communicate(double* data_ptr, int num_fields, int item_type, - MPI_Comm comm = MPI_COMM_WORLD, bool blocking = true); - - - /** - * @brief Non-blocking version: initiate communication - * Returns immediately; user must call wait_communication() - */ - void communicate_begin(double* data_ptr, int num_fields, int item_type, - MPI_Comm comm = MPI_COMM_WORLD); - - - /** - * @brief Wait for non-blocking communication to complete - */ - void wait_communication(double* data_ptr, int num_fields, int item_type); - - - // ======================================================================== - // LOW-LEVEL PACK/UNPACK (for manual control) - // ======================================================================== - - /** - * @brief Pack element data from contiguous 
array into send buffer - * @param data_ptr Pointer to element data [size: num_total_elems * num_fields] - * @param num_fields Stride (fields per element) - * - * Packs data in layout: [elem0_field0, elem0_field1, ..., elem1_field0, ...] - */ - void pack_ghosts(const double* data_ptr, int num_fields, int field_dimension); - - - /** - * @brief Unpack received element data into ghost elements - */ - void unpack_ghosts(double* data_ptr, int num_fields, int field_dimension); - - - - // ======================================================================== - // MPI EXCHANGE PRIMITIVES - // ======================================================================== - - /** - * @brief Execute MPI_Isend/Irecv for elements - */ - void exchange_ghosts_begin(int num_fields, int field_dimension, MPI_Comm comm = MPI_COMM_WORLD); - - - /** - * @brief Wait for element exchange to complete - */ - void exchange_ghosts_wait(); - - - - // ======================================================================== - // PERSISTENT COMMUNICATION (OPTIMIZATION) - // ======================================================================== - - /** - * @brief Setup persistent MPI communication handles (one-time setup) - * Call once after build(), then use start_persistent/wait_persistent - */ - void init_persistent(int elem_fields, int node_fields, MPI_Comm comm = MPI_COMM_WORLD); - - - /** - * @brief Start persistent send/recv (must call pack_* first) - */ - void start_persistent(); - - - /** - * @brief Wait for persistent communication (then call unpack_*) - */ - void wait_persistent(); - - - /** - * @brief Free persistent communication handles - */ - void free_persistent(); - - - // ======================================================================== - // NEIGHBORHOOD COLLECTIVES (MPI-3.0+) - // ======================================================================== - - /** - * @brief Create distributed graph communicator from communication pattern - * - * Call this ONCE after populating 
send_rank_ids and recv_rank_ids. - * The graph communicator encodes the sparse communication topology and is - * reused for all subsequent neighborhood collective calls. - * - * @param base_comm Base communicator (usually MPI_COMM_WORLD) - * - * Example from your output: - * rank 0 sends to: {2, 3, 4, 10, 11} - * rank 0 receives from: {computed from ghost ownership} - * - * This creates a directed graph where edges represent communication channels. - * MPI can optimize routing and minimize network contention. - * - * Requirements: MPI-3.0+ (2012) - */ - void create_graph_communicator(MPI_Comm base_comm = MPI_COMM_WORLD); - - - /** - * @brief Exchange ghost data using MPI_Neighbor_alltoallv - * - * Uses the pre-created graph communicator for efficient sparse communication. - * This is cleaner than manual Isend/Irecv loops and allows MPI to optimize. - * - * @param data_ptr Pointer to data array [size: num_total_items * num_fields] - * @param num_fields Number of fields per item (e.g., 3 for velocity) - * - * Workflow: - * 1. Pack owned items into send buffer - * 2. Call MPI_Neighbor_alltoallv (blocking but fast with graph_comm) - * 3. Unpack ghost items from receive buffer - * - * The graph_comm is reused each call - only pack/unpack overhead per timestep. - * - * Requirements: Must call create_graph_communicator() once before using this. - */ - void exchange_ghosts_neighborhood(double* data_ptr, int num_fields); - - - /** - * @brief Initialize persistent neighborhood collective (MPI-4.0+) - * - * Creates a persistent MPI request that pre-allocates all internal buffers - * and communication paths. Provides maximum performance for repeated exchanges - * with the same num_fields. 
- * - * @param num_fields Number of fields per item (must be same for all timesteps) - * - * Call once during setup: - * comm_plan.create_graph_communicator(MPI_COMM_WORLD); - * comm_plan.init_persistent_neighborhood(3); // For 3D velocity - * - * Then use exchange_ghosts_persistent() each timestep. - * - * Requirements: MPI-4.0+ (2021). Check with: mpirun --version - */ - void init_persistent_neighborhood(int num_fields); - - - /** - * @brief Exchange ghosts using persistent neighborhood collective (FASTEST) - * - * Must call init_persistent_neighborhood() once before using this. - * This is the fastest ghost exchange method for fixed communication patterns. - * - * @param data_ptr Pointer to data array [size: num_total_items * num_fields] - * - * Workflow: - * 1. Pack data into same send buffer used during init - * 2. MPI_Start() - extremely fast, no setup overhead - * 3. MPI_Wait() - wait for completion - * 4. Unpack from receive buffer - * - * Typical speedup vs standard neighborhood: 1.2-1.5x - * - * Note: Falls back to exchange_ghosts_neighborhood() if MPI-4 unavailable. - */ - void exchange_ghosts_persistent(double* data_ptr); - - - /** - * @brief Free persistent neighborhood collective resources - * - * Call at end of simulation to release MPI resources. - * Automatically called by destructor if not explicitly freed. 
- */ - void free_persistent_neighborhood(); - - - // ======================================================================== - // UTILITIES - // ======================================================================== - - void print_summary(int rank) const; - bool validate(MPI_Comm comm = MPI_COMM_WORLD) const; - size_t send_volume(int elem_fields, int node_fields) const; - size_t recv_volume(int elem_fields, int node_fields) const; - bool needs_communication() const; - int num_neighbor_ranks() const; + void initialize(int num_send_ranks, int num_recv_ranks){ + this->num_send_ranks = num_send_ranks; + this->num_recv_ranks = num_recv_ranks; + + send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); + recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); + send_ghost_offsets = DCArrayKokkos(num_send_ranks + 1, "send_ghost_offsets"); + recv_ghost_offsets = DCArrayKokkos(num_recv_ranks + 1, "recv_ghost_offsets"); + send_ghost_lids = DCArrayKokkos(total_send_ghosts, "send_ghost_lids"); + recv_ghost_lids = DCArrayKokkos(total_recv_ghosts, "recv_ghost_lids"); + send_ghost_gids = std::vector(total_send_ghosts, "send_ghost_gids"); + recv_ghost_gids = std::vector(total_recv_ghosts, "recv_ghost_gids"); + send_requests = DCArrayKokkos(total_send_ghosts, "send_requests"); + recv_requests = DCArrayKokkos(total_recv_ghosts, "recv_requests"); + mpi_statuses = DCArrayKokkos(total_send_ghosts + total_recv_ghosts, "mpi_statuses"); + persistent_send_requests = DCArrayKokkos(total_send_ghosts, "persistent_send_requests"); + persistent_recv_requests = DCArrayKokkos(total_recv_ghosts, "persistent_recv_requests"); + send_counts = DCArrayKokkos(num_send_ranks, "send_counts"); + send_displs = DCArrayKokkos(num_send_ranks, "send_displs"); + recv_counts = DCArrayKokkos(num_recv_ranks, "recv_counts"); + recv_displs = DCArrayKokkos(num_recv_ranks, "recv_displs"); + + } + // ======================================================================== // INLINE IMPLEMENTATIONS - 
NEIGHBORHOOD COLLECTIVES diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 26dd83c6..9c4267bf 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -15,6 +15,9 @@ #include "state.h" #include "mesh_io.h" + +#include "communication_plan.h" + // Include Scotch headers #include "scotch.h" #include "ptscotch.h" @@ -2013,6 +2016,11 @@ void partition_mesh( + // Build communication plans for elements + CommunicationPlan element_comm_plan; + + + element_comm_plan.initialize(num_send_ranks, num_recv_ranks); @@ -2122,6 +2130,16 @@ void partition_mesh( if(rank == 0) std::cout << " Finished building node communication reverse map" << std::endl; + + + // Build communication plans for elements and nodes + CommunicationPlan element_comm_plan; + CommunicationPlan node_comm_plan; + + element_comm_plan.build(intermediate_mesh, world_size, rank, boundary_elem_targets, boundary_elem_local_ids, boundary_to_ghost_ranks); + node_comm_plan.build(intermediate_mesh, world_size, rank, boundary_node_targets, boundary_node_local_ids, boundary_to_ghost_ranks); + + } diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h new file mode 100644 index 00000000..5ba78be9 --- /dev/null +++ b/examples/mesh_decomp/mpi_type.h @@ -0,0 +1,360 @@ +#ifndef MPIDARRAYKOKKOS_H +#define MPIDARRAYKOKKOS_H + +#include "matar.h" +#include "communication_plan.h" + +using namespace mtr; + +///////////////////////// +// MPIDArrayKokkos: Dual type for managing distributed data on both CPU and GPU. +// +// Enhanced with automatic ghost synchronization via CommunicationPlan. +// Allocates space for owned + ghost items and provides communicate() method. 
+// +// Usage: +// node.coords.communicate() -> syncs ghost nodes automatically +// elem.density.communicate() -> syncs ghost elements automatically +///////////////////////// +template +class MPIDArrayKokkos { + + // this is manage + using TArray1D = Kokkos::DualView ; + +protected: + size_t dims_[7]; + size_t length_; + size_t order_; // tensor order (rank) + int mpi_recv_rank_; + int mpi_tag_; + MPI_Comm mpi_comm_; + MPI_Status mpi_status_; + MPI_Datatype mpi_datatype_; + MPI_Request mpi_request_; + TArray1D this_array_; + + // --- Ghost Communication Support --- + CommunicationPlan* comm_plan_; // Pointer to shared communication plan + size_t num_owned_items_; // Number of owned items (nodes/elements) + size_t num_total_items_; // Total items including ghosts (owned + ghost) + size_t num_fields_; // Fields per item (e.g., 3 for 3D coordinates) + + void set_mpi_type(); + +public: + // Data member to access host view + ViewCArray host; + + MPIDArrayKokkos(); + + MPIDArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIDArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, + size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); + + + // ======================================================================== + // DISTRIBUTED COMMUNICATION METHODS (NEW) + // 
======================================================================== + + /** + * @brief Set communication plan and ghost metadata + * + * Call this ONCE after allocating the array to enable ghost communication. + * Multiple fields can share the same CommunicationPlan pointer. + * + * @param plan Pointer to shared CommunicationPlan (node or element plan) + * @param num_owned Number of owned items on this rank + * @param num_total Total items including ghosts (owned + ghost) + * + * Example: + * node.coords = MPIDArrayKokkos(num_total_nodes, 3); + * node.coords.set_communication_plan(&node_comm_plan, num_owned_nodes, num_total_nodes); + */ + void set_communication_plan(CommunicationPlan* plan, size_t num_owned, size_t num_total); + + + /** + * @brief Synchronize ghost data using neighborhood collectives + * + * Automatically exchanges boundary → ghost data for this field. + * Uses the CommunicationPlan provided via set_communication_plan(). + * + * Workflow: + * 1. Updates host data from device (if needed) + * 2. Packs owned boundary items + * 3. Calls MPI_Neighbor_alltoallv (via comm_plan) + * 4. Unpacks into ghost items + * 5. Updates device with new ghost data + * + * Example usage: + * // Update owned nodes + * for (int i = 0; i < num_owned_nodes; i++) { + * node.coords(i, 0) += dt * velocity(i, 0); + * } + * + * // Sync ghosts + * node.coords.communicate(); + * + * // Now ghost data is current + */ + void communicate(); + + + /** + * @brief Non-blocking version: start ghost exchange + * + * For advanced users who want to overlap computation with communication. + * Must call communicate_wait() before accessing ghost data. 
+ */ + void communicate_begin(); + + + /** + * @brief Wait for non-blocking ghost exchange to complete + */ + void communicate_wait(); + + + /** + * @brief Get number of owned items (excludes ghosts) + */ + KOKKOS_INLINE_FUNCTION + size_t num_owned() const { return num_owned_items_; } + + + /** + * @brief Get total items including ghosts + */ + KOKKOS_INLINE_FUNCTION + size_t num_total() const { return num_total_items_; } + + + /** + * @brief Check if ghost communication is configured + */ + bool has_communication_plan() const { return comm_plan_ != nullptr; } + + // These functions can setup the data needed for halo send/receives + // Not necessary for standard MPI comms + void mpi_setup(); + + void mpi_setup(int recv_rank); + + void mpi_setup(int recv_rank, int tag); + + void mpi_setup(int recv_rank, int tag, MPI_Comm comm); + + void mpi_set_rank(int recv_rank); + + void mpi_set_tag(int tag); + + void mpi_set_comm(MPI_Comm comm); + + int get_rank(); + + int get_tag(); + + MPI_Comm get_comm(); + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n, size_t o) const; + + KOKKOS_INLINE_FUNCTION + MPIDArrayKokkos& operator=(const MPIDArrayKokkos& temp); + + // GPU Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t size() const; + + // Host Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t extent() const; + + KOKKOS_INLINE_FUNCTION + size_t dims(size_t i) const; + + 
KOKKOS_INLINE_FUNCTION + size_t order() const; + + // Method returns the raw device pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* device_pointer() const; + + // Method returns the raw host pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* host_pointer() const; + + // Method returns kokkos dual view + KOKKOS_INLINE_FUNCTION + TArray1D get_kokkos_dual_view() const; + + // Method that update host view + void update_host(); + + // Method that update device view + void update_device(); + + + + // Deconstructor + virtual KOKKOS_INLINE_FUNCTION + ~MPIDArrayKokkos (); +}; // End of MPIDArrayKokkos + + +// ============================================================================ +// INLINE IMPLEMENTATIONS - DISTRIBUTED COMMUNICATION +// ============================================================================ + +/** + * @brief Default constructor - initialize ghost communication members + */ +template +KOKKOS_INLINE_FUNCTION +MPIDArrayKokkos::MPIDArrayKokkos() + : comm_plan_(nullptr), + num_owned_items_(0), + num_total_items_(0), + num_fields_(0) +{ + // Base constructor handles array initialization +} + + +/** + * @brief Set communication plan and ghost metadata + */ +template +inline void MPIDArrayKokkos::set_communication_plan( + CommunicationPlan* plan, + size_t num_owned, + size_t num_total) +{ + comm_plan_ = plan; + num_owned_items_ = num_owned; + num_total_items_ = num_total; + + // Infer number of fields from array dimensions + // Assumption: dim0 = num_items, dim1+ = fields + if (order_ == 1) { + num_fields_ = 1; // Scalar field + } else if (order_ == 2) { + num_fields_ = dims_[1]; // Vector field (e.g., coords[num_nodes, 3]) + } else { + // For higher order tensors, treat everything after dim0 as fields + num_fields_ = 1; + for (size_t i = 1; i < order_; i++) { + num_fields_ *= dims_[i]; + } + } + + // Validate dimensions match total items + if (dims_[0] != num_total) { + std::cerr << "Error: Array dim0 (" << dims_[0] << ") does not 
match num_total (" + << num_total << ")" << std::endl; + std::cerr << " Array must be allocated with size = num_owned + num_ghost" << std::endl; + } +} + + +/** + * @brief Synchronize ghost data using neighborhood collectives + */ +template +inline void MPIDArrayKokkos::communicate() +{ + if (!comm_plan_) { + std::cerr << "Error: CommunicationPlan not set. Call set_communication_plan() first." << std::endl; + return; + } + + if (!comm_plan_->has_graph_comm) { + std::cerr << "Error: Graph communicator not initialized in CommunicationPlan." << std::endl; + std::cerr << " Call comm_plan.create_graph_communicator() first." << std::endl; + return; + } + + // 1. Update host from device (ensure data is current on CPU for MPI) + this->update_host(); + + // 2. Get raw pointer to data + T* data_ptr = this->host_pointer(); + + // 3. Convert to double* for MPI communication + // TODO: Support other types (int, float, etc.) with template specialization + static_assert(std::is_same::value, + "Currently only double supported for ghost communication"); + + double* double_ptr = reinterpret_cast(data_ptr); + + // 4. Call neighborhood collective exchange + comm_plan_->exchange_ghosts_neighborhood(double_ptr, static_cast(num_fields_)); + + // 5. 
Update device with new ghost data + this->update_device(); +} + + +/** + * @brief Non-blocking version: start ghost exchange + */ +template +inline void MPIDArrayKokkos::communicate_begin() +{ + // TODO: Implement non-blocking version using Isend/Irecv + // For now, just call blocking version + std::cerr << "Warning: communicate_begin() not yet implemented, using blocking communicate()" << std::endl; + communicate(); +} + + +/** + * @brief Wait for non-blocking ghost exchange to complete + */ +template +inline void MPIDArrayKokkos::communicate_wait() +{ + // TODO: Implement non-blocking version + // For now, this is a no-op since communicate_begin() is blocking +} + + +#endif // MPIDARRAYKOKKOS_H diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h index 7a1cb676..8afb9abf 100644 --- a/examples/mesh_decomp/state.h +++ b/examples/mesh_decomp/state.h @@ -55,6 +55,8 @@ enum class node_state ///////////////////////////////////////////////////////////////////////////// struct node_t { + + // Replace with MPIDCArrayKokkos DCArrayKokkos coords; ///< Nodal coordinates DCArrayKokkos coords_n0; ///< Nodal coordinates at tn=0 of time integration From eb377938f54e5456d1eab13c72cd22d6c63df528 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 4 Nov 2025 16:38:56 -0600 Subject: [PATCH 17/52] ENH: Testing Neighbor comms, WIP --- examples/mesh_decomp/communication_plan.h | 572 ++++++++++----------- examples/mesh_decomp/decomp_utils.h | 489 ++++++++++++++++-- examples/mesh_decomp/mesh.h | 10 +- examples/mesh_decomp/mesh_decomp.cpp | 6 +- examples/mesh_decomp/mesh_io.h | 6 +- examples/mesh_decomp/mpi_type.h | 588 +++++++++++----------- examples/mesh_decomp/state.h | 8 +- 7 files changed, 1058 insertions(+), 621 deletions(-) diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h index 32833e1a..7c6f9ecb 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -1,348 
+1,348 @@ -/** - * @struct CommunicationPlan - * @brief Manages efficient MPI communication for ghost element and node data exchange - * - * Pure data-oriented design with only flat, contiguous arrays for maximum cache efficiency. - * Designed to be embedded in distributed data structures for automatic ghost synchronization. - * - * Usage pattern in distributed structures: - * node.velocity.comm() -> automatically syncs ghost nodes - * elem.density.comm() -> automatically syncs ghost elements - * - */ - struct CommunicationPlan { +// /** +// * @struct CommunicationPlan +// * @brief Manages efficient MPI communication for ghost element and node data exchange +// * +// * Pure data-oriented design with only flat, contiguous arrays for maximum cache efficiency. +// * Designed to be embedded in distributed data structures for automatic ghost synchronization. +// * +// * Usage pattern in distributed structures: +// * node.velocity.comm() -> automatically syncs ghost nodes +// * elem.density.comm() -> automatically syncs ghost elements +// * +// */ +// struct CommunicationPlan { - // ======================================================================== - // CORE DATA STRUCTURES - FLAT ARRAYS ONLY - // ======================================================================== +// // ======================================================================== +// // CORE DATA STRUCTURES - FLAT ARRAYS ONLY +// // ======================================================================== - // --- Ghost Send Plan: Owned elements/nodes -> destination ranks --- (Works for both elements and nodes) - int num_send_ranks; // Number of destination ranks - DCArrayKokkos send_rank_ids; // [size: num_send_ranks] Destination rank IDs - DCArrayKokkos send_ghost_offsets; // [size: num_send_ranks+1] CSR offsets into send_ghost_lids - DCArrayKokkos send_ghost_lids; // [size: total_send_ghosts] Local IDs of owned elements/nodes to send - std::vector send_ghost_gids; // [size: total_send_ghosts] 
Global IDs (for debug/validation) +// // --- Ghost Send Plan: Owned elements/nodes -> destination ranks --- (Works for both elements and nodes) +// int num_send_ranks; // Number of destination ranks +// DCArrayKokkos send_rank_ids; // [size: num_send_ranks] Destination rank IDs +// DCArrayKokkos send_ghost_offsets; // [size: num_send_ranks+1] CSR offsets into send_ghost_lids +// DCArrayKokkos send_ghost_lids; // [size: total_send_ghosts] Local IDs of owned elements/nodes to send +// std::vector send_ghost_gids; // [size: total_send_ghosts] Global IDs (for debug/validation) - // --- Ghost Receive Plan: Ghost elements/nodes <- source ranks --- (Works for both elements and nodes) - int num_recv_ranks; // Number of source ranks - DCArrayKokkos recv_rank_ids; // [size: num_recv_ranks] Source rank IDs - DCArrayKokkos recv_ghost_offsets; // [size: num_recv_ranks+1] CSR offsets into recv_ghost_lids - DCArrayKokkos recv_ghost_lids; // [size: total_recv_ghosts] Local IDs of ghost elements/nodes (>= num_owned) - std::vector recv_ghost_gids; // [size: total_recv_ghosts] Global IDs +// // --- Ghost Receive Plan: Ghost elements/nodes <- source ranks --- (Works for both elements and nodes) +// int num_recv_ranks; // Number of source ranks +// DCArrayKokkos recv_rank_ids; // [size: num_recv_ranks] Source rank IDs +// DCArrayKokkos recv_ghost_offsets; // [size: num_recv_ranks+1] CSR offsets into recv_ghost_lids +// DCArrayKokkos recv_ghost_lids; // [size: total_recv_ghosts] Local IDs of ghost elements/nodes (>= num_owned) +// std::vector recv_ghost_gids; // [size: total_recv_ghosts] Global IDs - DCArrayKokkos send_requests; // Request handles for sends - DCArrayKokkos recv_requests; // Request handles for receives - DCArrayKokkos mpi_statuses; // Status array for MPI_Waitall +// DCArrayKokkos send_requests; // Request handles for sends +// DCArrayKokkos recv_requests; // Request handles for receives +// DCArrayKokkos mpi_statuses; // Status array for MPI_Waitall - // --- Persistent 
communication (optional optimization) --- - DCArrayKokkos persistent_send_requests; - DCArrayKokkos persistent_recv_requests; - bool has_persistent_comm; +// // --- Persistent communication (optional optimization) --- +// DCArrayKokkos persistent_send_requests; +// DCArrayKokkos persistent_recv_requests; +// bool has_persistent_comm; - // --- Distributed Graph Topology for Neighborhood Collectives --- - MPI_Comm graph_comm; // Graph communicator encoding sparse communication pattern - bool has_graph_comm; // Whether graph communicator is initialized +// // --- Distributed Graph Topology for Neighborhood Collectives --- +// MPI_Comm graph_comm; // Graph communicator encoding sparse communication pattern +// bool has_graph_comm; // Whether graph communicator is initialized - // Counts and displacements for MPI_Neighbor_alltoallv - DCArrayKokkos send_counts; // [num_send_ranks] Number of items to send per neighbor - DCArrayKokkos send_displs; // [num_send_ranks] Displacements in send buffer - DCArrayKokkos recv_counts; // [num_recv_ranks] Number of items to recv per neighbor - DCArrayKokkos recv_displs; // [num_recv_ranks] Displacements in recv buffer +// // Counts and displacements for MPI_Neighbor_alltoallv +// DCArrayKokkos send_counts; // [num_send_ranks] Number of items to send per neighbor +// DCArrayKokkos send_displs; // [num_send_ranks] Displacements in send buffer +// DCArrayKokkos recv_counts; // [num_recv_ranks] Number of items to recv per neighbor +// DCArrayKokkos recv_displs; // [num_recv_ranks] Displacements in recv buffer - // --- Persistent Neighborhood Collectives (MPI-4.0+) --- - MPI_Request persistent_neighbor_request; // Persistent request for neighborhood collective - bool has_persistent_neighbor; // Whether persistent neighborhood is initialized - int persistent_num_fields; // Fields per item for persistent request +// // --- Persistent Neighborhood Collectives (MPI-4.0+) --- +// MPI_Request persistent_neighbor_request; // Persistent request 
for neighborhood collective +// bool has_persistent_neighbor; // Whether persistent neighborhood is initialized +// int persistent_num_fields; // Fields per item for persistent request - // ======================================================================== - // CONSTRUCTOR / INITIALIZATION - // ======================================================================== +// // ======================================================================== +// // CONSTRUCTOR / INITIALIZATION +// // ======================================================================== - CommunicationPlan() - : num_send_ranks(0), num_recv_ranks(0), - has_persistent_comm(false), - has_graph_comm(false), - has_persistent_neighbor(false), - graph_comm(MPI_COMM_NULL), - persistent_neighbor_request(MPI_REQUEST_NULL), - persistent_num_fields(0) {} +// CommunicationPlan() +// : num_send_ranks(0), num_recv_ranks(0), +// has_persistent_comm(false), +// has_graph_comm(false), +// has_persistent_neighbor(false), +// graph_comm(MPI_COMM_NULL), +// persistent_neighbor_request(MPI_REQUEST_NULL), +// persistent_num_fields(0) {} - // Destructor to free MPI resources - ~CommunicationPlan() { - // Free persistent neighborhood collective - if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { - MPI_Request_free(&persistent_neighbor_request); - } +// // Destructor to free MPI resources +// ~CommunicationPlan() { +// // Free persistent neighborhood collective +// if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { +// MPI_Request_free(&persistent_neighbor_request); +// } - // Free graph communicator - if (has_graph_comm && graph_comm != MPI_COMM_NULL) { - MPI_Comm_free(&graph_comm); - } - } +// // Free graph communicator +// if (has_graph_comm && graph_comm != MPI_COMM_NULL) { +// MPI_Comm_free(&graph_comm); +// } +// } - void initialize(int num_send_ranks, int num_recv_ranks){ - this->num_send_ranks = num_send_ranks; - this->num_recv_ranks = 
num_recv_ranks; +// void initialize(int num_send_ranks, int num_recv_ranks){ +// this->num_send_ranks = num_send_ranks; +// this->num_recv_ranks = num_recv_ranks; - send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); - recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); - send_ghost_offsets = DCArrayKokkos(num_send_ranks + 1, "send_ghost_offsets"); - recv_ghost_offsets = DCArrayKokkos(num_recv_ranks + 1, "recv_ghost_offsets"); - send_ghost_lids = DCArrayKokkos(total_send_ghosts, "send_ghost_lids"); - recv_ghost_lids = DCArrayKokkos(total_recv_ghosts, "recv_ghost_lids"); - send_ghost_gids = std::vector(total_send_ghosts, "send_ghost_gids"); - recv_ghost_gids = std::vector(total_recv_ghosts, "recv_ghost_gids"); - send_requests = DCArrayKokkos(total_send_ghosts, "send_requests"); - recv_requests = DCArrayKokkos(total_recv_ghosts, "recv_requests"); - mpi_statuses = DCArrayKokkos(total_send_ghosts + total_recv_ghosts, "mpi_statuses"); - persistent_send_requests = DCArrayKokkos(total_send_ghosts, "persistent_send_requests"); - persistent_recv_requests = DCArrayKokkos(total_recv_ghosts, "persistent_recv_requests"); - send_counts = DCArrayKokkos(num_send_ranks, "send_counts"); - send_displs = DCArrayKokkos(num_send_ranks, "send_displs"); - recv_counts = DCArrayKokkos(num_recv_ranks, "recv_counts"); - recv_displs = DCArrayKokkos(num_recv_ranks, "recv_displs"); +// send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); +// recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); +// send_ghost_offsets = DCArrayKokkos(num_send_ranks + 1, "send_ghost_offsets"); +// recv_ghost_offsets = DCArrayKokkos(num_recv_ranks + 1, "recv_ghost_offsets"); +// send_ghost_lids = DCArrayKokkos(total_send_ghosts, "send_ghost_lids"); +// recv_ghost_lids = DCArrayKokkos(total_recv_ghosts, "recv_ghost_lids"); +// send_ghost_gids = std::vector(total_send_ghosts, "send_ghost_gids"); +// recv_ghost_gids = std::vector(total_recv_ghosts, "recv_ghost_gids"); +// 
send_requests = DCArrayKokkos(total_send_ghosts, "send_requests"); +// recv_requests = DCArrayKokkos(total_recv_ghosts, "recv_requests"); +// mpi_statuses = DCArrayKokkos(total_send_ghosts + total_recv_ghosts, "mpi_statuses"); +// persistent_send_requests = DCArrayKokkos(total_send_ghosts, "persistent_send_requests"); +// persistent_recv_requests = DCArrayKokkos(total_recv_ghosts, "persistent_recv_requests"); +// send_counts = DCArrayKokkos(num_send_ranks, "send_counts"); +// send_displs = DCArrayKokkos(num_send_ranks, "send_displs"); +// recv_counts = DCArrayKokkos(num_recv_ranks, "recv_counts"); +// recv_displs = DCArrayKokkos(num_recv_ranks, "recv_displs"); - } +// } - // ======================================================================== - // INLINE IMPLEMENTATIONS - NEIGHBORHOOD COLLECTIVES - // ======================================================================== +// // ======================================================================== +// // INLINE IMPLEMENTATIONS - NEIGHBORHOOD COLLECTIVES +// // ======================================================================== - /** - * @brief Create distributed graph communicator from communication pattern - */ - inline void create_graph_communicator(MPI_Comm base_comm) { +// /** +// * @brief Create distributed graph communicator from communication pattern +// */ +// inline void create_graph_communicator(MPI_Comm base_comm) { - if (has_graph_comm) { - std::cerr << "Warning: Graph communicator already created, skipping." << std::endl; - return; - } +// if (has_graph_comm) { +// std::cerr << "Warning: Graph communicator already created, skipping." 
<< std::endl; +// return; +// } - int indegree = num_recv_ranks; // Number of ranks we receive FROM - int outdegree = num_send_ranks; // Number of ranks we send TO +// int indegree = num_recv_ranks; // Number of ranks we receive FROM +// int outdegree = num_send_ranks; // Number of ranks we send TO - // Create the distributed graph communicator - // MPI_Dist_graph_create_adjacent signature: - // (comm_old, indegree, sources[], sourceweights, outdegree, dests[], destweights, - // info, reorder, comm_dist_graph) - int reorder = 0; // Don't reorder ranks (keep same as base_comm) +// // Create the distributed graph communicator +// // MPI_Dist_graph_create_adjacent signature: +// // (comm_old, indegree, sources[], sourceweights, outdegree, dests[], destweights, +// // info, reorder, comm_dist_graph) +// int reorder = 0; // Don't reorder ranks (keep same as base_comm) - MPI_Dist_graph_create_adjacent( - base_comm, // Base communicator - indegree, // We receive from num_recv_ranks neighbors - recv_rank_ids.data(), // Source ranks (we receive from these) - MPI_UNWEIGHTED, // No edge weights for sources - outdegree, // We send to num_send_ranks neighbors - send_rank_ids.data(), // Destination ranks (we send to these) - MPI_UNWEIGHTED, // No edge weights for destinations - MPI_INFO_NULL, // No special hints - reorder, // Don't reorder ranks - &graph_comm // Output: new graph communicator - ); +// MPI_Dist_graph_create_adjacent( +// base_comm, // Base communicator +// indegree, // We receive from num_recv_ranks neighbors +// recv_rank_ids.data(), // Source ranks (we receive from these) +// MPI_UNWEIGHTED, // No edge weights for sources +// outdegree, // We send to num_send_ranks neighbors +// send_rank_ids.data(), // Destination ranks (we send to these) +// MPI_UNWEIGHTED, // No edge weights for destinations +// MPI_INFO_NULL, // No special hints +// reorder, // Don't reorder ranks +// &graph_comm // Output: new graph communicator +// ); - has_graph_comm = true; +// 
has_graph_comm = true; - // Pre-allocate counts and displacements arrays - send_counts.resize(num_send_ranks); - send_displs.resize(num_send_ranks); - recv_counts.resize(num_recv_ranks); - recv_displs.resize(num_recv_ranks); - } +// // Pre-allocate counts and displacements arrays +// send_counts.resize(num_send_ranks); +// send_displs.resize(num_send_ranks); +// recv_counts.resize(num_recv_ranks); +// recv_displs.resize(num_recv_ranks); +// } - /** - * @brief Exchange ghost data using MPI_Neighbor_alltoallv - */ - inline void exchange_ghosts_neighborhood(double* data_ptr, int num_fields) { +// /** +// * @brief Exchange ghost data using MPI_Neighbor_alltoallv +// */ +// inline void exchange_ghosts_neighborhood(double* data_ptr, int num_fields) { - if (!has_graph_comm) { - std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; - return; - } +// if (!has_graph_comm) { +// std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; +// return; +// } - // 1. Pack send buffer from owned items - int total_send = send_ghost_lids.size(); - ghost_send_buffer.resize(total_send * num_fields); +// // 1. Pack send buffer from owned items +// int total_send = send_ghost_lids.size(); +// ghost_send_buffer.resize(total_send * num_fields); - for (size_t i = 0; i < send_ghost_lids.size(); i++) { - int local_id = send_ghost_lids[i]; - for (int f = 0; f < num_fields; f++) { - ghost_send_buffer[i * num_fields + f] = data_ptr[local_id * num_fields + f]; - } - } +// for (size_t i = 0; i < send_ghost_lids.size(); i++) { +// int local_id = send_ghost_lids[i]; +// for (int f = 0; f < num_fields; f++) { +// ghost_send_buffer[i * num_fields + f] = data_ptr[local_id * num_fields + f]; +// } +// } - // 2. 
Update counts and displacements for this num_fields - for (int i = 0; i < num_send_ranks; i++) { - int start_idx = send_ghost_offsets[i]; - int end_idx = send_ghost_offsets[i + 1]; - send_counts[i] = (end_idx - start_idx) * num_fields; - send_displs[i] = start_idx * num_fields; - } +// // 2. Update counts and displacements for this num_fields +// for (int i = 0; i < num_send_ranks; i++) { +// int start_idx = send_ghost_offsets[i]; +// int end_idx = send_ghost_offsets[i + 1]; +// send_counts[i] = (end_idx - start_idx) * num_fields; +// send_displs[i] = start_idx * num_fields; +// } - int total_recv = recv_ghost_lids.size(); - ghost_recv_buffer.resize(total_recv * num_fields); +// int total_recv = recv_ghost_lids.size(); +// ghost_recv_buffer.resize(total_recv * num_fields); - for (int i = 0; i < num_recv_ranks; i++) { - int start_idx = recv_ghost_offsets[i]; - int end_idx = recv_ghost_offsets[i + 1]; - recv_counts[i] = (end_idx - start_idx) * num_fields; - recv_displs[i] = start_idx * num_fields; - } +// for (int i = 0; i < num_recv_ranks; i++) { +// int start_idx = recv_ghost_offsets[i]; +// int end_idx = recv_ghost_offsets[i + 1]; +// recv_counts[i] = (end_idx - start_idx) * num_fields; +// recv_displs[i] = start_idx * num_fields; +// } - // 3. Execute neighborhood collective (BLOCKING but fast with graph_comm) - // MPI_Neighbor_alltoallv signature: - // (sendbuf, sendcounts[], sdispls[], sendtype, - // recvbuf, recvcounts[], rdispls[], recvtype, comm) - MPI_Neighbor_alltoallv( - ghost_send_buffer.data(), // Send buffer - send_counts.data(), // Send counts per neighbor - send_displs.data(), // Send displacements - MPI_DOUBLE, // Send type - ghost_recv_buffer.data(), // Receive buffer - recv_counts.data(), // Receive counts per neighbor - recv_displs.data(), // Receive displacements - MPI_DOUBLE, // Receive type - graph_comm // Graph communicator (NOT MPI_COMM_WORLD!) - ); +// // 3. 
Execute neighborhood collective (BLOCKING but fast with graph_comm) +// // MPI_Neighbor_alltoallv signature: +// // (sendbuf, sendcounts[], sdispls[], sendtype, +// // recvbuf, recvcounts[], rdispls[], recvtype, comm) +// MPI_Neighbor_alltoallv( +// ghost_send_buffer.data(), // Send buffer +// send_counts.data(), // Send counts per neighbor +// send_displs.data(), // Send displacements +// MPI_DOUBLE, // Send type +// ghost_recv_buffer.data(), // Receive buffer +// recv_counts.data(), // Receive counts per neighbor +// recv_displs.data(), // Receive displacements +// MPI_DOUBLE, // Receive type +// graph_comm // Graph communicator (NOT MPI_COMM_WORLD!) +// ); - // 4. Unpack receive buffer into ghost items - for (size_t i = 0; i < recv_ghost_lids.size(); i++) { - int ghost_local_id = recv_ghost_lids[i]; - for (int f = 0; f < num_fields; f++) { - data_ptr[ghost_local_id * num_fields + f] = ghost_recv_buffer[i * num_fields + f]; - } - } - } +// // 4. Unpack receive buffer into ghost items +// for (size_t i = 0; i < recv_ghost_lids.size(); i++) { +// int ghost_local_id = recv_ghost_lids[i]; +// for (int f = 0; f < num_fields; f++) { +// data_ptr[ghost_local_id * num_fields + f] = ghost_recv_buffer[i * num_fields + f]; +// } +// } +// } - /** - * @brief Initialize persistent neighborhood collective (MPI-4.0+) - */ - inline void init_persistent_neighborhood(int num_fields) { +// /** +// * @brief Initialize persistent neighborhood collective (MPI-4.0+) +// */ +// inline void init_persistent_neighborhood(int num_fields) { - if (!has_graph_comm) { - std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; - return; - } +// if (!has_graph_comm) { +// std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; +// return; +// } - if (has_persistent_neighbor) { - std::cerr << "Warning: Persistent neighborhood already initialized, freeing and re-creating." 
<< std::endl; - free_persistent_neighborhood(); - } +// if (has_persistent_neighbor) { +// std::cerr << "Warning: Persistent neighborhood already initialized, freeing and re-creating." << std::endl; +// free_persistent_neighborhood(); +// } - persistent_num_fields = num_fields; +// persistent_num_fields = num_fields; - // Allocate buffers - int total_send = send_ghost_lids.size(); - int total_recv = recv_ghost_lids.size(); - ghost_send_buffer.resize(total_send * num_fields); - ghost_recv_buffer.resize(total_recv * num_fields); +// // Allocate buffers +// int total_send = send_ghost_lids.size(); +// int total_recv = recv_ghost_lids.size(); +// ghost_send_buffer.resize(total_send * num_fields); +// ghost_recv_buffer.resize(total_recv * num_fields); - // Setup counts and displacements for persistent request - for (int i = 0; i < num_send_ranks; i++) { - int start_idx = send_ghost_offsets[i]; - int end_idx = send_ghost_offsets[i + 1]; - send_counts[i] = (end_idx - start_idx) * num_fields; - send_displs[i] = start_idx * num_fields; - } +// // Setup counts and displacements for persistent request +// for (int i = 0; i < num_send_ranks; i++) { +// int start_idx = send_ghost_offsets[i]; +// int end_idx = send_ghost_offsets[i + 1]; +// send_counts[i] = (end_idx - start_idx) * num_fields; +// send_displs[i] = start_idx * num_fields; +// } - for (int i = 0; i < num_recv_ranks; i++) { - int start_idx = recv_ghost_offsets[i]; - int end_idx = recv_ghost_offsets[i + 1]; - recv_counts[i] = (end_idx - start_idx) * num_fields; - recv_displs[i] = start_idx * num_fields; - } +// for (int i = 0; i < num_recv_ranks; i++) { +// int start_idx = recv_ghost_offsets[i]; +// int end_idx = recv_ghost_offsets[i + 1]; +// recv_counts[i] = (end_idx - start_idx) * num_fields; +// recv_displs[i] = start_idx * num_fields; +// } -#if MPI_VERSION >= 4 - // MPI-4.0+ persistent neighborhood collective - // MPI_Neighbor_alltoallv_init signature (similar to MPI_Neighbor_alltoallv but creates request): - 
// (sendbuf, sendcounts[], sdispls[], sendtype, - // recvbuf, recvcounts[], rdispls[], recvtype, comm, info, request) - MPI_Neighbor_alltoallv_init( - ghost_send_buffer.data(), send_counts.data(), send_displs.data(), MPI_DOUBLE, - ghost_recv_buffer.data(), recv_counts.data(), recv_displs.data(), MPI_DOUBLE, - graph_comm, - MPI_INFO_NULL, - &persistent_neighbor_request - ); - has_persistent_neighbor = true; -#else - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - if (rank == 0) { - std::cerr << "Warning: MPI-4.0 required for persistent neighborhood collectives" << std::endl; - std::cerr << " Detected MPI version: " << MPI_VERSION << "." << MPI_SUBVERSION << std::endl; - std::cerr << " Will fall back to standard neighborhood collective" << std::endl; - } - has_persistent_neighbor = false; -#endif - } +// #if MPI_VERSION >= 4 +// // MPI-4.0+ persistent neighborhood collective +// // MPI_Neighbor_alltoallv_init signature (similar to MPI_Neighbor_alltoallv but creates request): +// // (sendbuf, sendcounts[], sdispls[], sendtype, +// // recvbuf, recvcounts[], rdispls[], recvtype, comm, info, request) +// MPI_Neighbor_alltoallv_init( +// ghost_send_buffer.data(), send_counts.data(), send_displs.data(), MPI_DOUBLE, +// ghost_recv_buffer.data(), recv_counts.data(), recv_displs.data(), MPI_DOUBLE, +// graph_comm, +// MPI_INFO_NULL, +// &persistent_neighbor_request +// ); +// has_persistent_neighbor = true; +// #else +// int rank; +// MPI_Comm_rank(MPI_COMM_WORLD, &rank); +// if (rank == 0) { +// std::cerr << "Warning: MPI-4.0 required for persistent neighborhood collectives" << std::endl; +// std::cerr << " Detected MPI version: " << MPI_VERSION << "." 
<< MPI_SUBVERSION << std::endl; +// std::cerr << " Will fall back to standard neighborhood collective" << std::endl; +// } +// has_persistent_neighbor = false; +// #endif +// } - /** - * @brief Exchange ghosts using persistent neighborhood collective (FASTEST) - */ - inline void exchange_ghosts_persistent(double* data_ptr) { +// /** +// * @brief Exchange ghosts using persistent neighborhood collective (FASTEST) +// */ +// inline void exchange_ghosts_persistent(double* data_ptr) { -#if MPI_VERSION >= 4 - if (!has_persistent_neighbor) { - std::cerr << "Error: Must call init_persistent_neighborhood() first!" << std::endl; - std::cerr << " Falling back to standard neighborhood collective..." << std::endl; - exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); - return; - } +// #if MPI_VERSION >= 4 +// if (!has_persistent_neighbor) { +// std::cerr << "Error: Must call init_persistent_neighborhood() first!" << std::endl; +// std::cerr << " Falling back to standard neighborhood collective..." << std::endl; +// exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); +// return; +// } - // 1. Pack send buffer (same memory location as during init) - for (size_t i = 0; i < send_ghost_lids.size(); i++) { - int local_id = send_ghost_lids[i]; - for (int f = 0; f < persistent_num_fields; f++) { - ghost_send_buffer[i * persistent_num_fields + f] = - data_ptr[local_id * persistent_num_fields + f]; - } - } +// // 1. Pack send buffer (same memory location as during init) +// for (size_t i = 0; i < send_ghost_lids.size(); i++) { +// int local_id = send_ghost_lids[i]; +// for (int f = 0; f < persistent_num_fields; f++) { +// ghost_send_buffer[i * persistent_num_fields + f] = +// data_ptr[local_id * persistent_num_fields + f]; +// } +// } - // 2. Start persistent request (VERY fast - no setup overhead) - MPI_Start(&persistent_neighbor_request); +// // 2. Start persistent request (VERY fast - no setup overhead) +// MPI_Start(&persistent_neighbor_request); - // 3. 
Wait for completion - MPI_Wait(&persistent_neighbor_request, MPI_STATUS_IGNORE); +// // 3. Wait for completion +// MPI_Wait(&persistent_neighbor_request, MPI_STATUS_IGNORE); - // 4. Unpack receive buffer - for (size_t i = 0; i < recv_ghost_lids.size(); i++) { - int ghost_id = recv_ghost_lids[i]; - for (int f = 0; f < persistent_num_fields; f++) { - data_ptr[ghost_id * persistent_num_fields + f] = - ghost_recv_buffer[i * persistent_num_fields + f]; - } - } -#else - // Fallback to standard method if MPI-4 not available - exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); -#endif - } +// // 4. Unpack receive buffer +// for (size_t i = 0; i < recv_ghost_lids.size(); i++) { +// int ghost_id = recv_ghost_lids[i]; +// for (int f = 0; f < persistent_num_fields; f++) { +// data_ptr[ghost_id * persistent_num_fields + f] = +// ghost_recv_buffer[i * persistent_num_fields + f]; +// } +// } +// #else +// // Fallback to standard method if MPI-4 not available +// exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); +// #endif +// } - /** - * @brief Free persistent neighborhood collective resources - */ - inline void free_persistent_neighborhood() { -#if MPI_VERSION >= 4 - if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { - MPI_Request_free(&persistent_neighbor_request); - persistent_neighbor_request = MPI_REQUEST_NULL; - has_persistent_neighbor = false; - } -#endif - } +// /** +// * @brief Free persistent neighborhood collective resources +// */ +// inline void free_persistent_neighborhood() { +// #if MPI_VERSION >= 4 +// if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { +// MPI_Request_free(&persistent_neighbor_request); +// persistent_neighbor_request = MPI_REQUEST_NULL; +// has_persistent_neighbor = false; +// } +// #endif +// } -}; +// }; diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 9c4267bf..752b39e6 100644 --- 
a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -16,8 +16,6 @@ #include "mesh_io.h" -#include "communication_plan.h" - // Include Scotch headers #include "scotch.h" #include "ptscotch.h" @@ -32,6 +30,7 @@ void partition_mesh( Mesh_t& final_mesh, node_t& initial_node, node_t& final_node, + GaussPoint_t& gauss_point, int world_size, int rank){ @@ -1675,6 +1674,56 @@ void partition_mesh( extended_lid_to_elem_gid[intermediate_mesh.num_elems + idx] = ghost_elem_gids_ordered[idx]; } + // Build array: for each ghost element, store which rank owns it (where to receive data from) + std::vector ghost_elem_owner_ranks(ghost_elem_gids_ordered.size()); + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); ++i) { + size_t ghost_gid = ghost_elem_gids_ordered[i]; + auto it = elem_gid_to_rank.find(ghost_gid); + if (it != elem_gid_to_rank.end()) { + ghost_elem_owner_ranks[i] = it->second; + } else { + std::cerr << "[rank " << rank << "] ERROR: Ghost element GID " << ghost_gid + << " not found in elem_gid_to_rank map!" 
<< std::endl; + ghost_elem_owner_ranks[i] = -1; // Invalid rank as error indicator + } + } + + // Optional: Print ghost element receive pattern + if (print_info) { + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] Ghost element receive pattern:" << std::endl; + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); ++i) { + size_t ghost_ext_lid = intermediate_mesh.num_elems + i; + std::cout << " Ghost elem ext_lid=" << ghost_ext_lid + << " gid=" << ghost_elem_gids_ordered[i] + << " receives from rank " << ghost_elem_owner_ranks[i] << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); + } + } + + // Create a std::set of all the ranks this rank will receive data from + std::set ghost_elem_receive_ranks; + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); ++i) { + ghost_elem_receive_ranks.insert(ghost_elem_owner_ranks[i]); + } + + + // Print with ranks this rank will receive element data from sequentially + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] Ranks this rank will receive element data from: "; + for (int rank : ghost_elem_receive_ranks) { + std::cout << rank << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } // ****************************************************************************************** @@ -1792,6 +1841,7 @@ void partition_mesh( all_owned_gids.data(), owned_counts.data(), owned_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + // d) Global coords (size: total_owned x 3) std::vector owned_coords_send(3*local_owned_count, 0.0); for (int i=0; i [ranks that ghost it]. - // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
- // -------------------------------------------------------------------------------------- +// -------------------------------------------------------------------------------------- +// Build the send patterns for elements +// Build reverse map via global IDs: for each local element gid, find ranks that ghost it. +// Steps: +// 1) Each rank contributes its ghost element GIDs. +// 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. +// 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. +// -------------------------------------------------------------------------------------- std::vector>> boundary_elem_targets(intermediate_mesh.num_elems); // Prepare local ghost list as vector @@ -1909,8 +1960,6 @@ void partition_mesh( std::cout.flush(); MPI_Barrier(MPI_COMM_WORLD); - - // Optional: print a compact summary of reverse map for verification (limited output) for(int i = 0; i < world_size; i++) { @@ -1982,7 +2031,7 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); if (rank == r) { std::cout << std::endl; - std::cout << "[rank " << rank << "] communicates to ranks: "; + std::cout << "[rank " << rank << "] elements communicates to ranks: "; for (int i = 0; i < num_ghost_comm_ranks; ++i) { std::cout << ghost_comm_ranks_vec[i] << " "; } @@ -1991,6 +2040,8 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); } + print_info = false; + // Print out the boundary element local ids on each rank sequentially for (int r = 0; r < world_size; ++r) { MPI_Barrier(MPI_COMM_WORLD); @@ -2009,31 +2060,415 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); } + + final_mesh.num_boundary_elems = boundary_elem_local_ids.size(); + final_mesh.boundary_elem_local_ids = DCArrayKokkos(final_mesh.num_boundary_elems); + for (int i = 0; i < final_mesh.num_boundary_elems; i++) { + final_mesh.boundary_elem_local_ids.host(i) = boundary_elem_local_ids[i]; + } + final_mesh.boundary_elem_local_ids.update_device(); + print_info = false; 
MPI_Barrier(MPI_COMM_WORLD); +// ****************************************************************************************** +// Create MPI distributed graph communicator for element communication +// ****************************************************************************************** + // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator + // that efficiently represents the communication pattern between ranks. + // This allows MPI to optimize communication based on the actual connectivity pattern. + + // ---------- Prepare input communicator ---------- + // comm_old: The base communicator from which to create the graph communicator + MPI_Comm comm_old = MPI_COMM_WORLD; + + // ---------- Prepare INCOMING edges (sources) ---------- + // indegree: Number of ranks from which this rank will RECEIVE data + // These are the ranks that own elements which are ghosted on this rank + std::vector ghost_elem_receive_ranks_vec(ghost_elem_receive_ranks.begin(), + ghost_elem_receive_ranks.end()); + // The number of ranks from which this rank will receive data (incoming neighbors) + int indegree = static_cast(ghost_elem_receive_ranks_vec.size()); + + // sources: Array of source rank IDs (ranks we receive from) + // Each element corresponds to a rank that owns elements we ghost + int* sources = (indegree > 0) ? 
ghost_elem_receive_ranks_vec.data() : MPI_UNWEIGHTED; + + // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + int* sourceweights = MPI_UNWEIGHTED; + + // ---------- Prepare OUTGOING edges (destinations) ---------- + // outdegree: Number of ranks to which this rank will SEND data + // These are the ranks that ghost elements owned by this rank + int outdegree = num_ghost_comm_ranks; + + // destinations: Array of destination rank IDs (ranks we send to) + // Each element corresponds to a rank that ghosts our owned elements + int* destinations = (outdegree > 0) ? ghost_comm_ranks_vec.data() : MPI_UNWEIGHTED; + + // destweights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + int* destweights = MPI_UNWEIGHTED; + + // ---------- Additional parameters ---------- + // info: Hints for optimization (MPI_INFO_NULL means use defaults) + MPI_Info info = MPI_INFO_NULL; + + // reorder: Whether to allow MPI to reorder ranks for optimization (0=no reordering) + // Setting to 0 preserves original rank numbering + int reorder = 0; + + // ---------- Output communicator ---------- + // graph_comm: The new distributed graph communicator that will be created + MPI_Comm graph_comm; + + // Create the distributed graph communicator + // This call collectively creates a communicator where each rank specifies: + // - Which ranks it receives from (sources/indegree) + // - Which ranks it sends to (destinations/outdegree) + // MPI can then optimize collective operations and point-to-point communication + // based on this connectivity information. 
+ MPI_Dist_graph_create_adjacent( + comm_old, // Input: base communicator + indegree, // Input: number of incoming neighbors (ranks we receive from) + sources, // Input: array of source ranks [indegree elements] + sourceweights, // Input: weights on incoming edges (MPI_UNWEIGHTED) + outdegree, // Input: number of outgoing neighbors (ranks we send to) + destinations, // Input: array of destination ranks [outdegree elements] + destweights, // Input: weights on outgoing edges (MPI_UNWEIGHTED) + info, // Input: optimization hints (MPI_INFO_NULL) + reorder, // Input: allow rank reordering (0=no) + &graph_comm // Output: new distributed graph communicator + ); + + // Optional: Verify the graph communicator was created successfully + if (rank == 0) { + std::cout << " Created MPI distributed graph communicator for element communication" << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + // ============================================================================ + // Verify the distributed graph communicator + // ============================================================================ + // Query the graph to verify it matches what we specified + int indegree_out, outdegree_out, weighted; + MPI_Dist_graph_neighbors_count(graph_comm, &indegree_out, &outdegree_out, &weighted); + + // Allocate arrays to receive neighbor information + std::vector sources_out(indegree_out); + std::vector sourceweights_out(indegree_out); + std::vector destinations_out(outdegree_out); + std::vector destweights_out(outdegree_out); + + // Retrieve the actual neighbors from the graph communicator + MPI_Dist_graph_neighbors(graph_comm, + indegree_out, sources_out.data(), sourceweights_out.data(), + outdegree_out, destinations_out.data(), destweights_out.data()); + + // Print verification information for each rank sequentially + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << 
std::endl; + std::cout << " Indegree (receives from " << indegree_out << " ranks): "; + for (int i = 0; i < indegree_out; ++i) { + std::cout << sources_out[i] << " "; + } + std::cout << std::endl; + + std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; + for (int i = 0; i < outdegree_out; ++i) { + std::cout << destinations_out[i] << " "; + } + std::cout << std::endl; + + std::cout << " Weighted: " << (weighted ? "yes" : "no") << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + // Additional verification: Check if the queried values match our input + bool verification_passed = true; + if (indegree_out != indegree) { + std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " + << "Expected " << indegree << ", got " << indegree_out << std::endl; + verification_passed = false; + } + if (outdegree_out != outdegree) { + std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! " + << "Expected " << outdegree << ", got " << outdegree_out << std::endl; + verification_passed = false; + } + + // Check if source and destination ranks match (order may differ) + std::set sources_set_in(ghost_elem_receive_ranks_vec.begin(), ghost_elem_receive_ranks_vec.end()); + std::set sources_set_out(sources_out.begin(), sources_out.end()); + if (sources_set_in != sources_set_out) { + std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" << std::endl; + verification_passed = false; + } + + std::set dests_set_in(ghost_comm_ranks_vec.begin(), ghost_comm_ranks_vec.end()); + std::set dests_set_out(destinations_out.begin(), destinations_out.end()); + if (dests_set_in != dests_set_out) { + std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" << std::endl; + verification_passed = false; + } + + // Global verification check + int local_passed = verification_passed ? 
1 : 0; + int global_passed = 0; + MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) { + if (global_passed) { + std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; + } else { + std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); - // Build communication plans for elements - CommunicationPlan element_comm_plan; - element_comm_plan.initialize(num_send_ranks, num_recv_ranks); +// ****************************************************************************************** +// Test element communication using MPI_Neighbor_alltoallv +// ****************************************************************************************** + // Gauss points share the same communication plan as elements. + // This test initializes gauss point fields on owned elements and exchanges them with ghost elements. + + print_info = true; // Enable debug output for communication test + + gauss_point.initialize(final_mesh.num_elems, 1, {gauss_pt_state::fields}); + + // Initialize the gauss point fields on each rank + // Set owned elements to rank number, ghost elements to -1 (to verify communication) + for (int i = 0; i < final_mesh.num_owned_elems; i++) { + gauss_point.fields.host(i) = static_cast(rank); + } + for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { + gauss_point.fields.host(i) = -1.0; // Ghost elements should be updated + } + gauss_point.fields.update_device(); + + // ========== Build send counts and displacements for OUTGOING neighbors (destinations) ========== + // For MPI_Neighbor_alltoallv with graph communicator: + // - elem_sendcounts[i] = number of elements to send to i-th outgoing neighbor (destinations_out[i]) + // - elem_sdispls[i] = starting position in send buffer for i-th outgoing neighbor + + std::vector elem_sendcounts(outdegree_out, 0); + std::vector 
elem_sdispls(outdegree_out, 0); + + // Count how many boundary elements go to each destination rank + // boundary_elem_targets[elem_lid] contains pairs (dest_rank, elem_gid) for each boundary element + std::map> elems_to_send_by_rank; // rank -> list of boundary element local IDs + + for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { + if (!boundary_elem_targets[elem_lid].empty()) { + for (const auto &pr : boundary_elem_targets[elem_lid]) { + int dest_rank = pr.first; + elems_to_send_by_rank[dest_rank].push_back(elem_lid); + } + } + } + + // Fill elem_sendcounts based on the graph communicator's destination order + int total_send = 0; + for (int i = 0; i < outdegree_out; i++) { + int dest_rank = destinations_out[i]; + elem_sendcounts[i] = static_cast(elems_to_send_by_rank[dest_rank].size()); + elem_sdispls[i] = total_send; + total_send += elem_sendcounts[i]; + } + + // Debug: Print send counts + if (print_info) { + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] Send counts: "; + for (int i = 0; i < outdegree_out; i++) { + std::cout << "to_rank_" << destinations_out[i] << "=" << elem_sendcounts[i] << " "; + } + std::cout << "(total=" << total_send << ")" << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + } + + // ========== Build receive counts and displacements for INCOMING neighbors (sources) ========== + // - elem_recvcounts[i] = number of elements to receive from i-th incoming neighbor (sources_out[i]) + // - elem_rdispls[i] = starting position in recv buffer for i-th incoming neighbor + + std::vector elem_recvcounts(indegree_out, 0); + std::vector elem_rdispls(indegree_out, 0); + + // Count how many ghost elements come from each source rank + // ghost_elem_owner_ranks[i] tells us which rank owns the i-th ghost element + std::map> elems_to_recv_by_rank; // rank -> list of ghost element indices + + for (size_t i = 0; i < ghost_elem_owner_ranks.size(); 
i++) { + int source_rank = ghost_elem_owner_ranks[i]; + elems_to_recv_by_rank[source_rank].push_back(static_cast(i)); + } + + // Fill elem_recvcounts based on the graph communicator's source order + int total_recv = 0; + for (int i = 0; i < indegree_out; i++) { + int source_rank = sources_out[i]; + elem_recvcounts[i] = static_cast(elems_to_recv_by_rank[source_rank].size()); + elem_rdispls[i] = total_recv; + total_recv += elem_recvcounts[i]; + } + + // Debug: Print receive counts + if (print_info) { + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] Recv counts: "; + for (int i = 0; i < indegree_out; i++) { + std::cout << "from_rank_" << sources_out[i] << "=" << elem_recvcounts[i] << " "; + } + std::cout << "(total=" << total_recv << ", expected_ghosts=" << final_mesh.num_ghost_elems << ")" << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + } + + // ========== Build send buffer organized by destination rank ========== + std::vector elem_send_buffer(total_send); + int send_idx = 0; + + for (int i = 0; i < outdegree_out; i++) { + int dest_rank = destinations_out[i]; + const auto& elems_for_this_rank = elems_to_send_by_rank[dest_rank]; + + for (int elem_lid : elems_for_this_rank) { + elem_send_buffer[send_idx++] = gauss_point.fields.host(elem_lid); + } + } + + // ========== Allocate receive buffer ========== + std::vector elem_recv_buffer(total_recv); + + // ========== Exchange data using MPI_Neighbor_alltoallv ========== + // MPI_Neighbor_alltoallv exchanges data with neighbors in the graph communicator topology + // - elem_sendcounts[i]: number of doubles to send to i-th outgoing neighbor + // - elem_recvcounts[i]: number of doubles to receive from i-th incoming neighbor + // - The order of neighbors must match the order returned by MPI_Dist_graph_neighbors + + MPI_Neighbor_alltoallv( + elem_send_buffer.data(), // Send buffer with boundary element data + elem_sendcounts.data(), // 
Number of elements to send to each outgoing neighbor [outdegree] + elem_sdispls.data(), // Displacement in send buffer for each outgoing neighbor [outdegree] + MPI_DOUBLE, // Send data type + elem_recv_buffer.data(), // Receive buffer for ghost element data + elem_recvcounts.data(), // Number of elements to receive from each incoming neighbor [indegree] + elem_rdispls.data(), // Displacement in recv buffer for each incoming neighbor [indegree] + MPI_DOUBLE, // Receive data type + graph_comm // Distributed graph communicator + ); + + // ========== Update ghost element fields from receive buffer ========== + // Unpack received data back into ghost elements in the correct order + + // Track which ghost elements have been updated for debugging + std::vector ghost_updated(final_mesh.num_ghost_elems, false); + + int recv_idx = 0; + for (int i = 0; i < indegree_out; i++) { + int source_rank = sources_out[i]; + const auto& ghost_indices = elems_to_recv_by_rank[source_rank]; + + for (int ghost_idx : ghost_indices) { + int ghost_elem_local_id = final_mesh.num_owned_elems + ghost_idx; + gauss_point.fields.host(ghost_elem_local_id) = elem_recv_buffer[recv_idx++]; + ghost_updated[ghost_idx] = true; + } + } + + // Debug: Check which ghosts weren't updated + if (print_info) { + std::vector missing_ghosts; + for (size_t i = 0; i < ghost_updated.size(); i++) { + if (!ghost_updated[i]) { + missing_ghosts.push_back(static_cast(i)); + } + } + + if (!missing_ghosts.empty()) { + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] WARNING: " << missing_ghosts.size() + << " ghost elements not in elems_to_recv_by_rank: "; + for (size_t i = 0; i < std::min(missing_ghosts.size(), size_t(10)); i++) { + std::cout << missing_ghosts[i] << " "; + } + if (missing_ghosts.size() > 10) std::cout << "..."; + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + } + } + + gauss_point.fields.update_device(); + + // 
========== Verify the communication worked correctly ========== + bool comm_test_passed = true; + for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { + if (gauss_point.fields.host(i) < 0.0) { + std::cerr << "[rank " << rank << "] ERROR: Ghost element " << i + << " was not updated (value = " << gauss_point.fields.host(i) << ")" << std::endl; + comm_test_passed = false; + } + } + + int local_test_passed = comm_test_passed ? 1 : 0; + int global_test_passed = 0; + MPI_Allreduce(&local_test_passed, &global_test_passed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) { + if (global_test_passed) { + std::cout << "\n✓ Element communication test PASSED on all ranks\n" << std::endl; + } else { + std::cout << "\n✗ Element communication test FAILED on one or more ranks\n" << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); + print_info = false; // Disable debug output after communication test + // Loop over all elements and average the values of elements connected to that element + for (int i = 0; i < final_mesh.num_elems; i++) { + double value = 0.0; + for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { + value += gauss_point.fields.host(final_mesh.elems_in_elem(i, j)); + } + value /= final_mesh.num_elems_in_elem(i); + gauss_point.fields.host(i) = value; + } + gauss_point.fields.update_device(); + - // -------------------------------------------------------------------------------------- - // Build reverse map via global IDs: for each local node gid, find ranks that ghost it. - // Steps: - // 1) Each rank contributes its ghost node GIDs. - // 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. - // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. 
- // -------------------------------------------------------------------------------------- +// -------------------------------------------------------------------------------------- +// Build the send pattern for nodes +// Build reverse map via global IDs: for each local node gid, find ranks that ghost it. +// Steps: +// 1) Each rank contributes its ghost node GIDs. +// 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. +// 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. +// -------------------------------------------------------------------------------------- std::vector>> boundary_node_targets(intermediate_mesh.num_nodes); @@ -2129,15 +2564,7 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout << " Finished building node communication reverse map" << std::endl; - - - - // Build communication plans for elements and nodes - CommunicationPlan element_comm_plan; - CommunicationPlan node_comm_plan; - - element_comm_plan.build(intermediate_mesh, world_size, rank, boundary_elem_targets, boundary_elem_local_ids, boundary_to_ghost_ranks); - node_comm_plan.build(intermediate_mesh, world_size, rank, boundary_node_targets, boundary_node_local_ids, boundary_to_ghost_ranks); + } diff --git a/examples/mesh_decomp/mesh.h b/examples/mesh_decomp/mesh.h index a745e17e..01ad00c6 100644 --- a/examples/mesh_decomp/mesh.h +++ b/examples/mesh_decomp/mesh.h @@ -310,11 +310,17 @@ struct Mesh_t DCArrayKokkos local_to_global_node_mapping; ///< Local to global node mapping DCArrayKokkos local_to_global_elem_mapping; ///< Local to global element mapping + // Element communicaiton data definitions size_t num_owned_elems; ///< Number of owned elements on this rank - size_t num_ghost_elems; ///< Number of ghost elements on this rank (from neighboring MPI ranks) + size_t num_boundary_elems; ///< Number of boundary elements on this rank (send data to neighboring MPI ranks) + DCArrayKokkos boundary_elem_local_ids; ///< 
Local IDs of boundary elements on this rank (send data to neighboring MPI ranks) + size_t num_ghost_elems; ///< Number of ghost elements on this rank (receive data from neighboring MPI ranks) + // Node communication data definitions size_t num_owned_nodes; ///< Number of owned nodes on this rank - size_t num_ghost_nodes; ///< Number of ghost nodes on this rank (from neighboring MPI ranks) + size_t num_boundary_nodes; ///< Number of boundary nodes on this rank (send data to neighboring MPI ranks) + DCArrayKokkos boundary_node_local_ids; ///< Local IDs of boundary nodes on this rank (send data to neighboring MPI ranks) + size_t num_ghost_nodes; ///< Number of ghost nodes on this rank (receive data from neighboring MPI ranks) diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 608c3867..b14ee9cd 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -44,6 +44,8 @@ int main(int argc, char** argv) { Mesh_t final_mesh; node_t final_node; + GaussPoint_t gauss_point; + // ******************************************************** // Build the initial mesh // ******************************************************** @@ -60,11 +62,11 @@ int main(int argc, char** argv) { // ******************************************************** // Partition and balance the mesh // ******************************************************** - partition_mesh(initial_mesh, final_mesh, initial_node, final_node, world_size, rank); + partition_mesh(initial_mesh, final_mesh, initial_node, final_node, gauss_point, world_size, rank); // write_vtk(intermediate_mesh, intermediate_node, rank); MPI_Barrier(MPI_COMM_WORLD); - write_vtu(final_mesh, final_node, rank, MPI_COMM_WORLD); + write_vtu(final_mesh, final_node, gauss_point, rank, MPI_COMM_WORLD); // write_vtk(final_mesh, final_node, rank); MPI_Barrier(MPI_COMM_WORLD); diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index f0801777..77dac8d0
100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -497,6 +497,7 @@ void build_3d_box( ///////////////////////////////////////////////////////////////////////////// void write_vtu(Mesh_t& mesh, node_t& node, + GaussPoint_t& gauss_point, int rank, MPI_Comm comm) { @@ -511,7 +512,7 @@ void write_vtu(Mesh_t& mesh, node.coords.update_host(); Kokkos::fence(); - const int num_cell_scalar_vars = 3; + const int num_cell_scalar_vars = 4; const int num_cell_vec_vars = 0; const int num_cell_tensor_vars = 0; @@ -520,7 +521,7 @@ void write_vtu(Mesh_t& mesh, // Scalar values associated with a cell const char cell_scalar_var_names[num_cell_scalar_vars][30] = { - "rank_id", "elems_in_elem_owned", "global_elem_id" + "rank_id", "elems_in_elem_owned", "global_elem_id", "field_value" }; const char point_scalar_var_names[num_point_scalar_vars][15] = { @@ -543,6 +544,7 @@ void write_vtu(Mesh_t& mesh, elem_fields(elem_gid, 0) = rank; elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); + elem_fields(elem_gid, 3) = gauss_point.fields.host(elem_gid); } // save the vertex vector fields to an array for exporting to graphics files diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index 5ba78be9..35b73985 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -1,360 +1,360 @@ -#ifndef MPIDARRAYKOKKOS_H -#define MPIDARRAYKOKKOS_H - -#include "matar.h" -#include "communication_plan.h" - -using namespace mtr; - -///////////////////////// -// MPIDArrayKokkos: Dual type for managing distributed data on both CPU and GPU. -// -// Enhanced with automatic ghost synchronization via CommunicationPlan. -// Allocates space for owned + ghost items and provides communicate() method. 
-// -// Usage: -// node.coords.communicate() -> syncs ghost nodes automatically -// elem.density.communicate() -> syncs ghost elements automatically -///////////////////////// -template -class MPIDArrayKokkos { - - // this is manage - using TArray1D = Kokkos::DualView ; +// #ifndef MPIDARRAYKOKKOS_H +// #define MPIDARRAYKOKKOS_H + +// #include "matar.h" +// #include "communication_plan.h" + +// using namespace mtr; + +// ///////////////////////// +// // MPIDArrayKokkos: Dual type for managing distributed data on both CPU and GPU. +// // +// // Enhanced with automatic ghost synchronization via CommunicationPlan. +// // Allocates space for owned + ghost items and provides communicate() method. +// // +// // Usage: +// // node.coords.communicate() -> syncs ghost nodes automatically +// // elem.density.communicate() -> syncs ghost elements automatically +// ///////////////////////// +// template +// class MPIDArrayKokkos { + +// // this is manage +// using TArray1D = Kokkos::DualView ; -protected: - size_t dims_[7]; - size_t length_; - size_t order_; // tensor order (rank) - int mpi_recv_rank_; - int mpi_tag_; - MPI_Comm mpi_comm_; - MPI_Status mpi_status_; - MPI_Datatype mpi_datatype_; - MPI_Request mpi_request_; - TArray1D this_array_; +// protected: +// size_t dims_[7]; +// size_t length_; +// size_t order_; // tensor order (rank) +// int mpi_recv_rank_; +// int mpi_tag_; +// MPI_Comm mpi_comm_; +// MPI_Status mpi_status_; +// MPI_Datatype mpi_datatype_; +// MPI_Request mpi_request_; +// TArray1D this_array_; - // --- Ghost Communication Support --- - CommunicationPlan* comm_plan_; // Pointer to shared communication plan - size_t num_owned_items_; // Number of owned items (nodes/elements) - size_t num_total_items_; // Total items including ghosts (owned + ghost) - size_t num_fields_; // Fields per item (e.g., 3 for 3D coordinates) +// // --- Ghost Communication Support --- +// CommunicationPlan* comm_plan_; // Pointer to shared communication plan +// size_t 
num_owned_items_; // Number of owned items (nodes/elements) +// size_t num_total_items_; // Total items including ghosts (owned + ghost) +// size_t num_fields_; // Fields per item (e.g., 3 for 3D coordinates) - void set_mpi_type(); +// void set_mpi_type(); -public: - // Data member to access host view - ViewCArray host; +// public: +// // Data member to access host view +// ViewCArray host; - MPIDArrayKokkos(); +// MPIDArrayKokkos(); - MPIDArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); +// MPIDArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); +// MPIDArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); +// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); +// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, +// size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); +// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, +// size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); +// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, +// size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, size_t dim5, - size_t dim6, const std::string& 
tag_string = DEFAULTSTRINGARRAY); +// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, +// size_t dim3, size_t dim4, size_t dim5, +// size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); - // ======================================================================== - // DISTRIBUTED COMMUNICATION METHODS (NEW) - // ======================================================================== +// // ======================================================================== +// // DISTRIBUTED COMMUNICATION METHODS (NEW) +// // ======================================================================== - /** - * @brief Set communication plan and ghost metadata - * - * Call this ONCE after allocating the array to enable ghost communication. - * Multiple fields can share the same CommunicationPlan pointer. - * - * @param plan Pointer to shared CommunicationPlan (node or element plan) - * @param num_owned Number of owned items on this rank - * @param num_total Total items including ghosts (owned + ghost) - * - * Example: - * node.coords = MPIDArrayKokkos(num_total_nodes, 3); - * node.coords.set_communication_plan(&node_comm_plan, num_owned_nodes, num_total_nodes); - */ - void set_communication_plan(CommunicationPlan* plan, size_t num_owned, size_t num_total); +// /** +// * @brief Set communication plan and ghost metadata +// * +// * Call this ONCE after allocating the array to enable ghost communication. +// * Multiple fields can share the same CommunicationPlan pointer. 
+// * +// * @param plan Pointer to shared CommunicationPlan (node or element plan) +// * @param num_owned Number of owned items on this rank +// * @param num_total Total items including ghosts (owned + ghost) +// * +// * Example: +// * node.coords = MPIDArrayKokkos(num_total_nodes, 3); +// * node.coords.set_communication_plan(&node_comm_plan, num_owned_nodes, num_total_nodes); +// */ +// void set_communication_plan(CommunicationPlan* plan, size_t num_owned, size_t num_total); - /** - * @brief Synchronize ghost data using neighborhood collectives - * - * Automatically exchanges boundary → ghost data for this field. - * Uses the CommunicationPlan provided via set_communication_plan(). - * - * Workflow: - * 1. Updates host data from device (if needed) - * 2. Packs owned boundary items - * 3. Calls MPI_Neighbor_alltoallv (via comm_plan) - * 4. Unpacks into ghost items - * 5. Updates device with new ghost data - * - * Example usage: - * // Update owned nodes - * for (int i = 0; i < num_owned_nodes; i++) { - * node.coords(i, 0) += dt * velocity(i, 0); - * } - * - * // Sync ghosts - * node.coords.communicate(); - * - * // Now ghost data is current - */ - void communicate(); +// /** +// * @brief Synchronize ghost data using neighborhood collectives +// * +// * Automatically exchanges boundary → ghost data for this field. +// * Uses the CommunicationPlan provided via set_communication_plan(). +// * +// * Workflow: +// * 1. Updates host data from device (if needed) +// * 2. Packs owned boundary items +// * 3. Calls MPI_Neighbor_alltoallv (via comm_plan) +// * 4. Unpacks into ghost items +// * 5. 
Updates device with new ghost data +// * +// * Example usage: +// * // Update owned nodes +// * for (int i = 0; i < num_owned_nodes; i++) { +// * node.coords(i, 0) += dt * velocity(i, 0); +// * } +// * +// * // Sync ghosts +// * node.coords.communicate(); +// * +// * // Now ghost data is current +// */ +// void communicate(); - /** - * @brief Non-blocking version: start ghost exchange - * - * For advanced users who want to overlap computation with communication. - * Must call communicate_wait() before accessing ghost data. - */ - void communicate_begin(); +// /** +// * @brief Non-blocking version: start ghost exchange +// * +// * For advanced users who want to overlap computation with communication. +// * Must call communicate_wait() before accessing ghost data. +// */ +// void communicate_begin(); - /** - * @brief Wait for non-blocking ghost exchange to complete - */ - void communicate_wait(); +// /** +// * @brief Wait for non-blocking ghost exchange to complete +// */ +// void communicate_wait(); - /** - * @brief Get number of owned items (excludes ghosts) - */ - KOKKOS_INLINE_FUNCTION - size_t num_owned() const { return num_owned_items_; } +// /** +// * @brief Get number of owned items (excludes ghosts) +// */ +// KOKKOS_INLINE_FUNCTION +// size_t num_owned() const { return num_owned_items_; } - /** - * @brief Get total items including ghosts - */ - KOKKOS_INLINE_FUNCTION - size_t num_total() const { return num_total_items_; } +// /** +// * @brief Get total items including ghosts +// */ +// KOKKOS_INLINE_FUNCTION +// size_t num_total() const { return num_total_items_; } - /** - * @brief Check if ghost communication is configured - */ - bool has_communication_plan() const { return comm_plan_ != nullptr; } +// /** +// * @brief Check if ghost communication is configured +// */ +// bool has_communication_plan() const { return comm_plan_ != nullptr; } - // These functions can setup the data needed for halo send/receives - // Not necessary for standard MPI comms - 
void mpi_setup(); +// // These functions can setup the data needed for halo send/receives +// // Not necessary for standard MPI comms +// void mpi_setup(); - void mpi_setup(int recv_rank); +// void mpi_setup(int recv_rank); - void mpi_setup(int recv_rank, int tag); +// void mpi_setup(int recv_rank, int tag); - void mpi_setup(int recv_rank, int tag, MPI_Comm comm); +// void mpi_setup(int recv_rank, int tag, MPI_Comm comm); - void mpi_set_rank(int recv_rank); +// void mpi_set_rank(int recv_rank); - void mpi_set_tag(int tag); +// void mpi_set_tag(int tag); - void mpi_set_comm(MPI_Comm comm); +// void mpi_set_comm(MPI_Comm comm); - int get_rank(); +// int get_rank(); - int get_tag(); +// int get_tag(); - MPI_Comm get_comm(); +// MPI_Comm get_comm(); - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i) const; +// KOKKOS_INLINE_FUNCTION +// T& operator()(size_t i) const; - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j) const; +// KOKKOS_INLINE_FUNCTION +// T& operator()(size_t i, size_t j) const; - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k) const; +// KOKKOS_INLINE_FUNCTION +// T& operator()(size_t i, size_t j, size_t k) const; - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l) const; +// KOKKOS_INLINE_FUNCTION +// T& operator()(size_t i, size_t j, size_t k, size_t l) const; - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; +// KOKKOS_INLINE_FUNCTION +// T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, - size_t n) const; +// KOKKOS_INLINE_FUNCTION +// T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, +// size_t n) const; - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, - size_t n, size_t o) const; +// KOKKOS_INLINE_FUNCTION +// T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, 
+// size_t n, size_t o) const; - KOKKOS_INLINE_FUNCTION - MPIDArrayKokkos& operator=(const MPIDArrayKokkos& temp); +// KOKKOS_INLINE_FUNCTION +// MPIDArrayKokkos& operator=(const MPIDArrayKokkos& temp); - // GPU Method - // Method that returns size - KOKKOS_INLINE_FUNCTION - size_t size() const; +// // GPU Method +// // Method that returns size +// KOKKOS_INLINE_FUNCTION +// size_t size() const; - // Host Method - // Method that returns size - KOKKOS_INLINE_FUNCTION - size_t extent() const; +// // Host Method +// // Method that returns size +// KOKKOS_INLINE_FUNCTION +// size_t extent() const; - KOKKOS_INLINE_FUNCTION - size_t dims(size_t i) const; +// KOKKOS_INLINE_FUNCTION +// size_t dims(size_t i) const; - KOKKOS_INLINE_FUNCTION - size_t order() const; +// KOKKOS_INLINE_FUNCTION +// size_t order() const; - // Method returns the raw device pointer of the Kokkos DualView - KOKKOS_INLINE_FUNCTION - T* device_pointer() const; +// // Method returns the raw device pointer of the Kokkos DualView +// KOKKOS_INLINE_FUNCTION +// T* device_pointer() const; - // Method returns the raw host pointer of the Kokkos DualView - KOKKOS_INLINE_FUNCTION - T* host_pointer() const; +// // Method returns the raw host pointer of the Kokkos DualView +// KOKKOS_INLINE_FUNCTION +// T* host_pointer() const; - // Method returns kokkos dual view - KOKKOS_INLINE_FUNCTION - TArray1D get_kokkos_dual_view() const; +// // Method returns kokkos dual view +// KOKKOS_INLINE_FUNCTION +// TArray1D get_kokkos_dual_view() const; - // Method that update host view - void update_host(); +// // Method that update host view +// void update_host(); - // Method that update device view - void update_device(); +// // Method that update device view +// void update_device(); - // Deconstructor - virtual KOKKOS_INLINE_FUNCTION - ~MPIDArrayKokkos (); -}; // End of MPIDArrayKokkos - - -// ============================================================================ -// INLINE IMPLEMENTATIONS - DISTRIBUTED COMMUNICATION 
-// ============================================================================ - -/** - * @brief Default constructor - initialize ghost communication members - */ -template -KOKKOS_INLINE_FUNCTION -MPIDArrayKokkos::MPIDArrayKokkos() - : comm_plan_(nullptr), - num_owned_items_(0), - num_total_items_(0), - num_fields_(0) -{ - // Base constructor handles array initialization -} - - -/** - * @brief Set communication plan and ghost metadata - */ -template -inline void MPIDArrayKokkos::set_communication_plan( - CommunicationPlan* plan, - size_t num_owned, - size_t num_total) -{ - comm_plan_ = plan; - num_owned_items_ = num_owned; - num_total_items_ = num_total; +// // Deconstructor +// virtual KOKKOS_INLINE_FUNCTION +// ~MPIDArrayKokkos (); +// }; // End of MPIDArrayKokkos + + +// // ============================================================================ +// // INLINE IMPLEMENTATIONS - DISTRIBUTED COMMUNICATION +// // ============================================================================ + +// /** +// * @brief Default constructor - initialize ghost communication members +// */ +// template +// KOKKOS_INLINE_FUNCTION +// MPIDArrayKokkos::MPIDArrayKokkos() +// : comm_plan_(nullptr), +// num_owned_items_(0), +// num_total_items_(0), +// num_fields_(0) +// { +// // Base constructor handles array initialization +// } + + +// /** +// * @brief Set communication plan and ghost metadata +// */ +// template +// inline void MPIDArrayKokkos::set_communication_plan( +// CommunicationPlan* plan, +// size_t num_owned, +// size_t num_total) +// { +// comm_plan_ = plan; +// num_owned_items_ = num_owned; +// num_total_items_ = num_total; - // Infer number of fields from array dimensions - // Assumption: dim0 = num_items, dim1+ = fields - if (order_ == 1) { - num_fields_ = 1; // Scalar field - } else if (order_ == 2) { - num_fields_ = dims_[1]; // Vector field (e.g., coords[num_nodes, 3]) - } else { - // For higher order tensors, treat everything after dim0 as fields - 
num_fields_ = 1; - for (size_t i = 1; i < order_; i++) { - num_fields_ *= dims_[i]; - } - } +// // Infer number of fields from array dimensions +// // Assumption: dim0 = num_items, dim1+ = fields +// if (order_ == 1) { +// num_fields_ = 1; // Scalar field +// } else if (order_ == 2) { +// num_fields_ = dims_[1]; // Vector field (e.g., coords[num_nodes, 3]) +// } else { +// // For higher order tensors, treat everything after dim0 as fields +// num_fields_ = 1; +// for (size_t i = 1; i < order_; i++) { +// num_fields_ *= dims_[i]; +// } +// } - // Validate dimensions match total items - if (dims_[0] != num_total) { - std::cerr << "Error: Array dim0 (" << dims_[0] << ") does not match num_total (" - << num_total << ")" << std::endl; - std::cerr << " Array must be allocated with size = num_owned + num_ghost" << std::endl; - } -} - - -/** - * @brief Synchronize ghost data using neighborhood collectives - */ -template -inline void MPIDArrayKokkos::communicate() -{ - if (!comm_plan_) { - std::cerr << "Error: CommunicationPlan not set. Call set_communication_plan() first." << std::endl; - return; - } +// // Validate dimensions match total items +// if (dims_[0] != num_total) { +// std::cerr << "Error: Array dim0 (" << dims_[0] << ") does not match num_total (" +// << num_total << ")" << std::endl; +// std::cerr << " Array must be allocated with size = num_owned + num_ghost" << std::endl; +// } +// } + + +// /** +// * @brief Synchronize ghost data using neighborhood collectives +// */ +// template +// inline void MPIDArrayKokkos::communicate() +// { +// if (!comm_plan_) { +// std::cerr << "Error: CommunicationPlan not set. Call set_communication_plan() first." << std::endl; +// return; +// } - if (!comm_plan_->has_graph_comm) { - std::cerr << "Error: Graph communicator not initialized in CommunicationPlan." << std::endl; - std::cerr << " Call comm_plan.create_graph_communicator() first." 
<< std::endl; - return; - } +// if (!comm_plan_->has_graph_comm) { +// std::cerr << "Error: Graph communicator not initialized in CommunicationPlan." << std::endl; +// std::cerr << " Call comm_plan.create_graph_communicator() first." << std::endl; +// return; +// } - // 1. Update host from device (ensure data is current on CPU for MPI) - this->update_host(); +// // 1. Update host from device (ensure data is current on CPU for MPI) +// this->update_host(); - // 2. Get raw pointer to data - T* data_ptr = this->host_pointer(); +// // 2. Get raw pointer to data +// T* data_ptr = this->host_pointer(); - // 3. Convert to double* for MPI communication - // TODO: Support other types (int, float, etc.) with template specialization - static_assert(std::is_same::value, - "Currently only double supported for ghost communication"); +// // 3. Convert to double* for MPI communication +// // TODO: Support other types (int, float, etc.) with template specialization +// static_assert(std::is_same::value, +// "Currently only double supported for ghost communication"); - double* double_ptr = reinterpret_cast(data_ptr); +// double* double_ptr = reinterpret_cast(data_ptr); - // 4. Call neighborhood collective exchange - comm_plan_->exchange_ghosts_neighborhood(double_ptr, static_cast(num_fields_)); +// // 4. Call neighborhood collective exchange +// comm_plan_->exchange_ghosts_neighborhood(double_ptr, static_cast(num_fields_)); - // 5. 
Update device with new ghost data - this->update_device(); -} - - -/** - * @brief Non-blocking version: start ghost exchange - */ -template -inline void MPIDArrayKokkos::communicate_begin() -{ - // TODO: Implement non-blocking version using Isend/Irecv - // For now, just call blocking version - std::cerr << "Warning: communicate_begin() not yet implemented, using blocking communicate()" << std::endl; - communicate(); -} - - -/** - * @brief Wait for non-blocking ghost exchange to complete - */ -template -inline void MPIDArrayKokkos::communicate_wait() -{ - // TODO: Implement non-blocking version - // For now, this is a no-op since communicate_begin() is blocking -} - - -#endif // MPIDARRAYKOKKOS_H +// // 5. Update device with new ghost data +// this->update_device(); +// } + + +// /** +// * @brief Non-blocking version: start ghost exchange +// */ +// template +// inline void MPIDArrayKokkos::communicate_begin() +// { +// // TODO: Implement non-blocking version using Isend/Irecv +// // For now, just call blocking version +// std::cerr << "Warning: communicate_begin() not yet implemented, using blocking communicate()" << std::endl; +// communicate(); +// } + + +// /** +// * @brief Wait for non-blocking ghost exchange to complete +// */ +// template +// inline void MPIDArrayKokkos::communicate_wait() +// { +// // TODO: Implement non-blocking version +// // For now, this is a no-op since communicate_begin() is blocking +// } + + +// #endif // MPIDARRAYKOKKOS_H diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h index 8afb9abf..01f54624 100644 --- a/examples/mesh_decomp/state.h +++ b/examples/mesh_decomp/state.h @@ -82,7 +82,7 @@ struct node_t // Possible gauss point states, used to initialize GaussPoint_t enum class gauss_pt_state { - volume + fields }; ///////////////////////////////////////////////////////////////////////////// @@ -95,7 +95,7 @@ enum class gauss_pt_state struct GaussPoint_t { - DCArrayKokkos vol; ///< GaussPoint volume + 
DCArrayKokkos fields; ///< GaussPoint fields // initialization method (num_cells, num_dims) @@ -104,8 +104,8 @@ struct GaussPoint_t for (auto field : gauss_pt_states){ switch(field){ - case gauss_pt_state::volume: - if (vol.size() == 0) this->vol = DCArrayKokkos(num_gauss_pnts, "gauss_point_volume"); + case gauss_pt_state::fields: + if (fields.size() == 0) this->fields = DCArrayKokkos(num_gauss_pnts, "gauss_point_fields"); break; default: std::cout<<"Desired gauss point state not understood in GaussPoint_t initialize"< Date: Wed, 5 Nov 2025 11:34:25 -0600 Subject: [PATCH 18/52] ENH: Working on defining the communication plan for MPI types --- examples/mesh_decomp/communication_plan.h | 534 ++++++++-------------- examples/mesh_decomp/decomp_utils.h | 165 ++----- examples/mesh_decomp/mpi_type.h | 366 +++++++-------- 3 files changed, 413 insertions(+), 652 deletions(-) diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h index 7c6f9ecb..1c95a40a 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -1,348 +1,216 @@ -// /** -// * @struct CommunicationPlan -// * @brief Manages efficient MPI communication for ghost element and node data exchange -// * -// * Pure data-oriented design with only flat, contiguous arrays for maximum cache efficiency. -// * Designed to be embedded in distributed data structures for automatic ghost synchronization. 
-// * -// * Usage pattern in distributed structures: -// * node.velocity.comm() -> automatically syncs ghost nodes -// * elem.density.comm() -> automatically syncs ghost elements -// * -// */ -// struct CommunicationPlan { - -// // ======================================================================== -// // CORE DATA STRUCTURES - FLAT ARRAYS ONLY -// // ======================================================================== +/** + * @struct CommunicationPlan + * @brief Manages efficient MPI communication for ghost element and node data exchange + * + * Pure data-oriented design with only flat, contiguous arrays for maximum cache efficiency. + * Designed to be embedded in distributed data structures for automatic ghost synchronization. + * + * Usage pattern in distributed structures: + * node.velocity.comm() -> automatically syncs ghost nodes + * elem.density.comm() -> automatically syncs ghost elements + * + */ + struct CommunicationPlan { + + // ======================================================================== + // Metadata for MPI neighbor graph communication + // ======================================================================== + // MPI world communicator + MPI_Comm mpi_comm_world; + bool has_comm_world = false; + int world_size = -1; -// // --- Ghost Send Plan: Owned elements/nodes -> destination ranks --- (Works for both elements and nodes) -// int num_send_ranks; // Number of destination ranks -// DCArrayKokkos send_rank_ids; // [size: num_send_ranks] Destination rank IDs -// DCArrayKokkos send_ghost_offsets; // [size: num_send_ranks+1] CSR offsets into send_ghost_lids -// DCArrayKokkos send_ghost_lids; // [size: total_send_ghosts] Local IDs of owned elements/nodes to send -// std::vector send_ghost_gids; // [size: total_send_ghosts] Global IDs (for debug/validation) - -// // --- Ghost Receive Plan: Ghost elements/nodes <- source ranks --- (Works for both elements and nodes) -// int num_recv_ranks; // Number of source ranks -// DCArrayKokkos 
recv_rank_ids; // [size: num_recv_ranks] Source rank IDs -// DCArrayKokkos recv_ghost_offsets; // [size: num_recv_ranks+1] CSR offsets into recv_ghost_lids -// DCArrayKokkos recv_ghost_lids; // [size: total_recv_ghosts] Local IDs of ghost elements/nodes (>= num_owned) -// std::vector recv_ghost_gids; // [size: total_recv_ghosts] Global IDs + // MPI graph communicator + MPI_Comm mpi_comm_graph; + bool has_comm_graph = false; - -// DCArrayKokkos send_requests; // Request handles for sends -// DCArrayKokkos recv_requests; // Request handles for receives -// DCArrayKokkos mpi_statuses; // Status array for MPI_Waitall - -// // --- Persistent communication (optional optimization) --- -// DCArrayKokkos persistent_send_requests; -// DCArrayKokkos persistent_recv_requests; -// bool has_persistent_comm; - - -// // --- Distributed Graph Topology for Neighborhood Collectives --- -// MPI_Comm graph_comm; // Graph communicator encoding sparse communication pattern -// bool has_graph_comm; // Whether graph communicator is initialized - -// // Counts and displacements for MPI_Neighbor_alltoallv -// DCArrayKokkos send_counts; // [num_send_ranks] Number of items to send per neighbor -// DCArrayKokkos send_displs; // [num_send_ranks] Displacements in send buffer -// DCArrayKokkos recv_counts; // [num_recv_ranks] Number of items to recv per neighbor -// DCArrayKokkos recv_displs; // [num_recv_ranks] Displacements in recv buffer - -// // --- Persistent Neighborhood Collectives (MPI-4.0+) --- -// MPI_Request persistent_neighbor_request; // Persistent request for neighborhood collective -// bool has_persistent_neighbor; // Whether persistent neighborhood is initialized -// int persistent_num_fields; // Fields per item for persistent request - - -// // ======================================================================== -// // CONSTRUCTOR / INITIALIZATION -// // ======================================================================== - -// CommunicationPlan() -// : num_send_ranks(0), 
num_recv_ranks(0), -// has_persistent_comm(false), -// has_graph_comm(false), -// has_persistent_neighbor(false), -// graph_comm(MPI_COMM_NULL), -// persistent_neighbor_request(MPI_REQUEST_NULL), -// persistent_num_fields(0) {} - - -// // Destructor to free MPI resources -// ~CommunicationPlan() { -// // Free persistent neighborhood collective -// if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { -// MPI_Request_free(&persistent_neighbor_request); -// } - -// // Free graph communicator -// if (has_graph_comm && graph_comm != MPI_COMM_NULL) { -// MPI_Comm_free(&graph_comm); -// } -// } - - -// void initialize(int num_send_ranks, int num_recv_ranks){ -// this->num_send_ranks = num_send_ranks; -// this->num_recv_ranks = num_recv_ranks; - -// send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); -// recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); -// send_ghost_offsets = DCArrayKokkos(num_send_ranks + 1, "send_ghost_offsets"); -// recv_ghost_offsets = DCArrayKokkos(num_recv_ranks + 1, "recv_ghost_offsets"); -// send_ghost_lids = DCArrayKokkos(total_send_ghosts, "send_ghost_lids"); -// recv_ghost_lids = DCArrayKokkos(total_recv_ghosts, "recv_ghost_lids"); -// send_ghost_gids = std::vector(total_send_ghosts, "send_ghost_gids"); -// recv_ghost_gids = std::vector(total_recv_ghosts, "recv_ghost_gids"); -// send_requests = DCArrayKokkos(total_send_ghosts, "send_requests"); -// recv_requests = DCArrayKokkos(total_recv_ghosts, "recv_requests"); -// mpi_statuses = DCArrayKokkos(total_send_ghosts + total_recv_ghosts, "mpi_statuses"); -// persistent_send_requests = DCArrayKokkos(total_send_ghosts, "persistent_send_requests"); -// persistent_recv_requests = DCArrayKokkos(total_recv_ghosts, "persistent_recv_requests"); -// send_counts = DCArrayKokkos(num_send_ranks, "send_counts"); -// send_displs = DCArrayKokkos(num_send_ranks, "send_displs"); -// recv_counts = DCArrayKokkos(num_recv_ranks, "recv_counts"); -// recv_displs = 
DCArrayKokkos(num_recv_ranks, "recv_displs"); - -// } - + // Number of send and recv ranks + int num_send_ranks; // In MPI language, this is the outdegree of the graph communicator + int num_recv_ranks; // In MPI language, this is the indegree of the graph communicator + + // Rank IDs for send and recv ranks + DCArrayKokkos send_rank_ids; // [size: num_send_ranks] Destination rank IDs + DCArrayKokkos recv_rank_ids; // [size: num_recv_ranks] Source rank IDs + + // recv_weights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + int* recv_weights = MPI_UNWEIGHTED; // [size: num_recv_ranks] Weights on incoming edges, set to MPI_UNWEIGHTED if not used + + // send_weights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + int* send_weights = MPI_UNWEIGHTED; // [size: num_send_ranks] Weights on outgoing edges, set to MPI_UNWEIGHTED if not used + + // info: Hints for optimization (MPI_INFO_NULL means use defaults) + MPI_Info info = MPI_INFO_NULL; + + // reorder: Whether to allow MPI to reorder ranks for optimization (0=no reordering) + // Setting to 0 preserves original rank numbering + // Note: In the future, we may want to allow MPI to reorder ranks for optimization by setting to 1, + // this would allow MPI to reorder the ranks to make them physically closer on the hardware. + // This is a good optimization for large meshes, but will require maps from MPI_comm_world rank IDs to the new reordered rank IDs. 
+ int reorder = 0; -// // ======================================================================== -// // INLINE IMPLEMENTATIONS - NEIGHBORHOOD COLLECTIVES -// // ======================================================================== + // ======================================================================== + // CONSTRUCTOR / INITIALIZATION + // ======================================================================== -// /** -// * @brief Create distributed graph communicator from communication pattern -// */ -// inline void create_graph_communicator(MPI_Comm base_comm) { - -// if (has_graph_comm) { -// std::cerr << "Warning: Graph communicator already created, skipping." << std::endl; -// return; -// } - -// int indegree = num_recv_ranks; // Number of ranks we receive FROM -// int outdegree = num_send_ranks; // Number of ranks we send TO - -// // Create the distributed graph communicator -// // MPI_Dist_graph_create_adjacent signature: -// // (comm_old, indegree, sources[], sourceweights, outdegree, dests[], destweights, -// // info, reorder, comm_dist_graph) -// int reorder = 0; // Don't reorder ranks (keep same as base_comm) - -// MPI_Dist_graph_create_adjacent( -// base_comm, // Base communicator -// indegree, // We receive from num_recv_ranks neighbors -// recv_rank_ids.data(), // Source ranks (we receive from these) -// MPI_UNWEIGHTED, // No edge weights for sources -// outdegree, // We send to num_send_ranks neighbors -// send_rank_ids.data(), // Destination ranks (we send to these) -// MPI_UNWEIGHTED, // No edge weights for destinations -// MPI_INFO_NULL, // No special hints -// reorder, // Don't reorder ranks -// &graph_comm // Output: new graph communicator -// ); - -// has_graph_comm = true; - -// // Pre-allocate counts and displacements arrays -// send_counts.resize(num_send_ranks); -// send_displs.resize(num_send_ranks); -// recv_counts.resize(num_recv_ranks); -// recv_displs.resize(num_recv_ranks); -// } + CommunicationPlan() + : 
num_send_ranks(0), num_recv_ranks(0), + has_comm_graph(false) {} -// /** -// * @brief Exchange ghost data using MPI_Neighbor_alltoallv -// */ -// inline void exchange_ghosts_neighborhood(double* data_ptr, int num_fields) { - -// if (!has_graph_comm) { -// std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; -// return; -// } - -// // 1. Pack send buffer from owned items -// int total_send = send_ghost_lids.size(); -// ghost_send_buffer.resize(total_send * num_fields); - -// for (size_t i = 0; i < send_ghost_lids.size(); i++) { -// int local_id = send_ghost_lids[i]; -// for (int f = 0; f < num_fields; f++) { -// ghost_send_buffer[i * num_fields + f] = data_ptr[local_id * num_fields + f]; -// } -// } - -// // 2. Update counts and displacements for this num_fields -// for (int i = 0; i < num_send_ranks; i++) { -// int start_idx = send_ghost_offsets[i]; -// int end_idx = send_ghost_offsets[i + 1]; -// send_counts[i] = (end_idx - start_idx) * num_fields; -// send_displs[i] = start_idx * num_fields; -// } - -// int total_recv = recv_ghost_lids.size(); -// ghost_recv_buffer.resize(total_recv * num_fields); - -// for (int i = 0; i < num_recv_ranks; i++) { -// int start_idx = recv_ghost_offsets[i]; -// int end_idx = recv_ghost_offsets[i + 1]; -// recv_counts[i] = (end_idx - start_idx) * num_fields; -// recv_displs[i] = start_idx * num_fields; -// } - -// // 3. 
Execute neighborhood collective (BLOCKING but fast with graph_comm) -// // MPI_Neighbor_alltoallv signature: -// // (sendbuf, sendcounts[], sdispls[], sendtype, -// // recvbuf, recvcounts[], rdispls[], recvtype, comm) -// MPI_Neighbor_alltoallv( -// ghost_send_buffer.data(), // Send buffer -// send_counts.data(), // Send counts per neighbor -// send_displs.data(), // Send displacements -// MPI_DOUBLE, // Send type -// ghost_recv_buffer.data(), // Receive buffer -// recv_counts.data(), // Receive counts per neighbor -// recv_displs.data(), // Receive displacements -// MPI_DOUBLE, // Receive type -// graph_comm // Graph communicator (NOT MPI_COMM_WORLD!) -// ); - -// // 4. Unpack receive buffer into ghost items -// for (size_t i = 0; i < recv_ghost_lids.size(); i++) { -// int ghost_local_id = recv_ghost_lids[i]; -// for (int f = 0; f < num_fields; f++) { -// data_ptr[ghost_local_id * num_fields + f] = ghost_recv_buffer[i * num_fields + f]; -// } -// } -// } - + // Destructor to free MPI resources + ~CommunicationPlan() { + // Free graph communicator + if (has_comm_graph && mpi_comm_graph != MPI_COMM_NULL) { + MPI_Comm_free(&mpi_comm_graph); + } + } -// /** -// * @brief Initialize persistent neighborhood collective (MPI-4.0+) -// */ -// inline void init_persistent_neighborhood(int num_fields) { - -// if (!has_graph_comm) { -// std::cerr << "Error: Must call create_graph_communicator() first!" << std::endl; -// return; -// } - -// if (has_persistent_neighbor) { -// std::cerr << "Warning: Persistent neighborhood already initialized, freeing and re-creating." 
<< std::endl; -// free_persistent_neighborhood(); -// } - -// persistent_num_fields = num_fields; - -// // Allocate buffers -// int total_send = send_ghost_lids.size(); -// int total_recv = recv_ghost_lids.size(); -// ghost_send_buffer.resize(total_send * num_fields); -// ghost_recv_buffer.resize(total_recv * num_fields); - -// // Setup counts and displacements for persistent request -// for (int i = 0; i < num_send_ranks; i++) { -// int start_idx = send_ghost_offsets[i]; -// int end_idx = send_ghost_offsets[i + 1]; -// send_counts[i] = (end_idx - start_idx) * num_fields; -// send_displs[i] = start_idx * num_fields; -// } - -// for (int i = 0; i < num_recv_ranks; i++) { -// int start_idx = recv_ghost_offsets[i]; -// int end_idx = recv_ghost_offsets[i + 1]; -// recv_counts[i] = (end_idx - start_idx) * num_fields; -// recv_displs[i] = start_idx * num_fields; -// } - -// #if MPI_VERSION >= 4 -// // MPI-4.0+ persistent neighborhood collective -// // MPI_Neighbor_alltoallv_init signature (similar to MPI_Neighbor_alltoallv but creates request): -// // (sendbuf, sendcounts[], sdispls[], sendtype, -// // recvbuf, recvcounts[], rdispls[], recvtype, comm, info, request) -// MPI_Neighbor_alltoallv_init( -// ghost_send_buffer.data(), send_counts.data(), send_displs.data(), MPI_DOUBLE, -// ghost_recv_buffer.data(), recv_counts.data(), recv_displs.data(), MPI_DOUBLE, -// graph_comm, -// MPI_INFO_NULL, -// &persistent_neighbor_request -// ); -// has_persistent_neighbor = true; -// #else -// int rank; -// MPI_Comm_rank(MPI_COMM_WORLD, &rank); -// if (rank == 0) { -// std::cerr << "Warning: MPI-4.0 required for persistent neighborhood collectives" << std::endl; -// std::cerr << " Detected MPI version: " << MPI_VERSION << "." 
<< MPI_SUBVERSION << std::endl; -// std::cerr << " Will fall back to standard neighborhood collective" << std::endl; -// } -// has_persistent_neighbor = false; -// #endif -// } + void initialize(MPI_Comm comm_world){ + this->mpi_comm_world = comm_world; + has_comm_world = true; + MPI_Comm_size(comm_world, &world_size); + } -// /** -// * @brief Exchange ghosts using persistent neighborhood collective (FASTEST) -// */ -// inline void exchange_ghosts_persistent(double* data_ptr) { + void initialize_graph_communicator(int num_send_ranks, int* send_rank_ids, int num_recv_ranks, int* recv_rank_ids){ -// #if MPI_VERSION >= 4 -// if (!has_persistent_neighbor) { -// std::cerr << "Error: Must call init_persistent_neighborhood() first!" << std::endl; -// std::cerr << " Falling back to standard neighborhood collective..." << std::endl; -// exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); -// return; -// } + if(!has_comm_world){ + throw std::runtime_error("MPI communicator for the world has not been initialized"); + } -// // 1. Pack send buffer (same memory location as during init) -// for (size_t i = 0; i < send_ghost_lids.size(); i++) { -// int local_id = send_ghost_lids[i]; -// for (int f = 0; f < persistent_num_fields; f++) { -// ghost_send_buffer[i * persistent_num_fields + f] = -// data_ptr[local_id * persistent_num_fields + f]; -// } -// } - -// // 2. Start persistent request (VERY fast - no setup overhead) -// MPI_Start(&persistent_neighbor_request); - -// // 3. Wait for completion -// MPI_Wait(&persistent_neighbor_request, MPI_STATUS_IGNORE); - -// // 4. 
Unpack receive buffer -// for (size_t i = 0; i < recv_ghost_lids.size(); i++) { -// int ghost_id = recv_ghost_lids[i]; -// for (int f = 0; f < persistent_num_fields; f++) { -// data_ptr[ghost_id * persistent_num_fields + f] = -// ghost_recv_buffer[i * persistent_num_fields + f]; -// } -// } -// #else -// // Fallback to standard method if MPI-4 not available -// exchange_ghosts_neighborhood(data_ptr, persistent_num_fields); -// #endif -// } - - -// /** -// * @brief Free persistent neighborhood collective resources -// */ -// inline void free_persistent_neighborhood() { -// #if MPI_VERSION >= 4 -// if (has_persistent_neighbor && persistent_neighbor_request != MPI_REQUEST_NULL) { -// MPI_Request_free(&persistent_neighbor_request); -// persistent_neighbor_request = MPI_REQUEST_NULL; -// has_persistent_neighbor = false; -// } -// #endif -// } - -// }; + this->num_send_ranks = num_send_ranks; + this->num_recv_ranks = num_recv_ranks; + + this->send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); + for(int i = 0; i < num_send_ranks; i++){ + this->send_rank_ids(i) = send_rank_ids[i]; + } + + + this->recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); + for(int i = 0; i < num_recv_ranks; i++){ + this->recv_rank_ids(i) = recv_rank_ids[i]; + } + + MPI_Dist_graph_create_adjacent( + mpi_comm_world, + num_recv_ranks, + this->recv_rank_ids.host_pointer(), + recv_weights, + num_send_ranks, + this->send_rank_ids.host_pointer(), + send_weights, + info, + reorder, + &mpi_comm_graph + ); + + has_comm_graph = true; + } + + void verify_graph_communicator(){ + if(!has_comm_graph){ + throw std::runtime_error("MPI graph communicator has not been initialized"); + } + + // ============================================================================ + // Verify the distributed graph communicator + // ============================================================================ + // Query the graph to verify it matches what we specified + int indegree_out, outdegree_out, 
weighted; + MPI_Dist_graph_neighbors_count(mpi_comm_graph, &indegree_out, &outdegree_out, &weighted); + + // Allocate arrays to receive neighbor information + std::vector sources_out(indegree_out); + std::vector sourceweights_out(indegree_out); + std::vector destinations_out(outdegree_out); + std::vector destweights_out(outdegree_out); + + // Retrieve the actual neighbors from the graph communicator + MPI_Dist_graph_neighbors(mpi_comm_graph, + indegree_out, sources_out.data(), sourceweights_out.data(), + outdegree_out, destinations_out.data(), destweights_out.data()); + + int rank = -1; + MPI_Comm_rank(mpi_comm_world, &rank); + + // Additional verification: Check if the queried values match our input + bool verification_passed = true; + + // Print verification information for each rank sequentially + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(mpi_comm_world); + if (rank == r) { + std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << std::endl; + std::cout << " Indegree (receives from " << indegree_out << " ranks): "; + for (int i = 0; i < indegree_out; ++i) { + std::cout << sources_out[i] << " "; + } + std::cout << std::endl; + + std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; + for (int i = 0; i < outdegree_out; ++i) { + std::cout << destinations_out[i] << " "; + } + std::cout << std::endl; + + std::cout << " Weighted: " << (weighted ? "yes" : "no") << std::endl; + } + MPI_Barrier(mpi_comm_world); + } + + // Check if the counts match our stored values + if (indegree_out != num_recv_ranks) { + std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " + << "Expected " << num_recv_ranks << ", got " << indegree_out << std::endl; + verification_passed = false; + } + if (outdegree_out != num_send_ranks) { + std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! 
" + << "Expected " << num_send_ranks << ", got " << outdegree_out << std::endl; + verification_passed = false; + } + + // Check if source ranks match (build set from our stored recv_rank_ids) + std::set sources_set_in; + for (int i = 0; i < num_recv_ranks; ++i) { + sources_set_in.insert(recv_rank_ids.host(i)); + } + std::set sources_set_out(sources_out.begin(), sources_out.end()); + if (sources_set_in != sources_set_out) { + std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" << std::endl; + verification_passed = false; + } + + // Check if destination ranks match (build set from our stored send_rank_ids) + std::set dests_set_in; + for (int i = 0; i < num_send_ranks; ++i) { + dests_set_in.insert(send_rank_ids.host(i)); + } + std::set dests_set_out(destinations_out.begin(), destinations_out.end()); + if (dests_set_in != dests_set_out) { + std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" << std::endl; + verification_passed = false; + } + + // Global verification check + int local_passed = verification_passed ? 
1 : 0; + int global_passed = 0; + MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, mpi_comm_world); + MPI_Barrier(mpi_comm_world); + if (rank == 0) { + if (global_passed) { + std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; + } else { + std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; + } + } + MPI_Barrier(mpi_comm_world); + } +}; diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 752b39e6..d4981a17 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -14,6 +14,7 @@ #include "mesh.h" #include "state.h" #include "mesh_io.h" +#include "communication_plan.h" // Include Scotch headers @@ -2077,13 +2078,14 @@ void partition_mesh( // ****************************************************************************************** // Create MPI distributed graph communicator for element communication // ****************************************************************************************** + + + CommunicationPlan element_communication_plan; + element_communication_plan.initialize(MPI_COMM_WORLD); // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator // that efficiently represents the communication pattern between ranks. // This allows MPI to optimize communication based on the actual connectivity pattern. - // ---------- Prepare input communicator ---------- - // comm_old: The base communicator from which to create the graph communicator - MPI_Comm comm_old = MPI_COMM_WORLD; // ---------- Prepare INCOMING edges (sources) ---------- // indegree: Number of ranks from which this rank will RECEIVE data @@ -2096,6 +2098,7 @@ void partition_mesh( // sources: Array of source rank IDs (ranks we receive from) // Each element corresponds to a rank that owns elements we ghost int* sources = (indegree > 0) ? 
ghost_elem_receive_ranks_vec.data() : MPI_UNWEIGHTED; + // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) // Could be used to specify communication volume if needed for optimization @@ -2109,132 +2112,22 @@ void partition_mesh( // destinations: Array of destination rank IDs (ranks we send to) // Each element corresponds to a rank that ghosts our owned elements int* destinations = (outdegree > 0) ? ghost_comm_ranks_vec.data() : MPI_UNWEIGHTED; - - // destweights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) - // Could be used to specify communication volume if needed for optimization - int* destweights = MPI_UNWEIGHTED; - - // ---------- Additional parameters ---------- - // info: Hints for optimization (MPI_INFO_NULL means use defaults) - MPI_Info info = MPI_INFO_NULL; - - // reorder: Whether to allow MPI to reorder ranks for optimization (0=no reordering) - // Setting to 0 preserves original rank numbering - int reorder = 0; - - // ---------- Output communicator ---------- - // graph_comm: The new distributed graph communicator that will be created - MPI_Comm graph_comm; - - // Create the distributed graph communicator - // This call collectively creates a communicator where each rank specifies: - // - Which ranks it receives from (sources/indegree) - // - Which ranks it sends to (destinations/outdegree) - // MPI can then optimize collective operations and point-to-point communication - // based on this connectivity information. 
- MPI_Dist_graph_create_adjacent( - comm_old, // Input: base communicator - indegree, // Input: number of incoming neighbors (ranks we receive from) - sources, // Input: array of source ranks [indegree elements] - sourceweights, // Input: weights on incoming edges (MPI_UNWEIGHTED) - outdegree, // Input: number of outgoing neighbors (ranks we send to) - destinations, // Input: array of destination ranks [outdegree elements] - destweights, // Input: weights on outgoing edges (MPI_UNWEIGHTED) - info, // Input: optimization hints (MPI_INFO_NULL) - reorder, // Input: allow rank reordering (0=no) - &graph_comm // Output: new distributed graph communicator - ); + // Initialize the graph communicator for element communication + element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), indegree, ghost_elem_receive_ranks_vec.data()); + // Optional: Verify the graph communicator was created successfully if (rank == 0) { std::cout << " Created MPI distributed graph communicator for element communication" << std::endl; } MPI_Barrier(MPI_COMM_WORLD); + + // ============================================================================ // Verify the distributed graph communicator // ============================================================================ - // Query the graph to verify it matches what we specified - int indegree_out, outdegree_out, weighted; - MPI_Dist_graph_neighbors_count(graph_comm, &indegree_out, &outdegree_out, &weighted); - - // Allocate arrays to receive neighbor information - std::vector sources_out(indegree_out); - std::vector sourceweights_out(indegree_out); - std::vector destinations_out(outdegree_out); - std::vector destweights_out(outdegree_out); - - // Retrieve the actual neighbors from the graph communicator - MPI_Dist_graph_neighbors(graph_comm, - indegree_out, sources_out.data(), sourceweights_out.data(), - outdegree_out, destinations_out.data(), destweights_out.data()); - - // Print verification 
information for each rank sequentially - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << std::endl; - std::cout << " Indegree (receives from " << indegree_out << " ranks): "; - for (int i = 0; i < indegree_out; ++i) { - std::cout << sources_out[i] << " "; - } - std::cout << std::endl; - - std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; - for (int i = 0; i < outdegree_out; ++i) { - std::cout << destinations_out[i] << " "; - } - std::cout << std::endl; - - std::cout << " Weighted: " << (weighted ? "yes" : "no") << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } - - // Additional verification: Check if the queried values match our input - bool verification_passed = true; - if (indegree_out != indegree) { - std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " - << "Expected " << indegree << ", got " << indegree_out << std::endl; - verification_passed = false; - } - if (outdegree_out != outdegree) { - std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! " - << "Expected " << outdegree << ", got " << outdegree_out << std::endl; - verification_passed = false; - } - - // Check if source and destination ranks match (order may differ) - std::set sources_set_in(ghost_elem_receive_ranks_vec.begin(), ghost_elem_receive_ranks_vec.end()); - std::set sources_set_out(sources_out.begin(), sources_out.end()); - if (sources_set_in != sources_set_out) { - std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" << std::endl; - verification_passed = false; - } - - std::set dests_set_in(ghost_comm_ranks_vec.begin(), ghost_comm_ranks_vec.end()); - std::set dests_set_out(destinations_out.begin(), destinations_out.end()); - if (dests_set_in != dests_set_out) { - std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" 
<< std::endl; - verification_passed = false; - } - - // Global verification check - int local_passed = verification_passed ? 1 : 0; - int global_passed = 0; - MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0) { - if (global_passed) { - std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; - } else { - std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; - } - } - MPI_Barrier(MPI_COMM_WORLD); - - - + element_communication_plan.verify_graph_communicator(); // ****************************************************************************************** @@ -2262,8 +2155,8 @@ void partition_mesh( // - elem_sendcounts[i] = number of elements to send to i-th outgoing neighbor (destinations_out[i]) // - elem_sdispls[i] = starting position in send buffer for i-th outgoing neighbor - std::vector elem_sendcounts(outdegree_out, 0); - std::vector elem_sdispls(outdegree_out, 0); + std::vector elem_sendcounts(element_communication_plan.num_send_ranks, 0); + std::vector elem_sdispls(element_communication_plan.num_send_ranks, 0); // Count how many boundary elements go to each destination rank // boundary_elem_targets[elem_lid] contains pairs (dest_rank, elem_gid) for each boundary element @@ -2280,8 +2173,8 @@ void partition_mesh( // Fill elem_sendcounts based on the graph communicator's destination order int total_send = 0; - for (int i = 0; i < outdegree_out; i++) { - int dest_rank = destinations_out[i]; + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); elem_sendcounts[i] = static_cast(elems_to_send_by_rank[dest_rank].size()); elem_sdispls[i] = total_send; total_send += elem_sendcounts[i]; @@ -2293,8 +2186,8 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); if (rank == r) { std::cout << "[rank " << rank << "] Send counts: "; - for (int i 
= 0; i < outdegree_out; i++) { - std::cout << "to_rank_" << destinations_out[i] << "=" << elem_sendcounts[i] << " "; + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + std::cout << "to_rank_" << element_communication_plan.send_rank_ids.host(i) << "=" << elem_sendcounts[i] << " "; } std::cout << "(total=" << total_send << ")" << std::endl; } @@ -2306,8 +2199,8 @@ void partition_mesh( // - elem_recvcounts[i] = number of elements to receive from i-th incoming neighbor (sources_out[i]) // - elem_rdispls[i] = starting position in recv buffer for i-th incoming neighbor - std::vector elem_recvcounts(indegree_out, 0); - std::vector elem_rdispls(indegree_out, 0); + std::vector elem_recvcounts(element_communication_plan.num_recv_ranks, 0); + std::vector elem_rdispls(element_communication_plan.num_recv_ranks, 0); // Count how many ghost elements come from each source rank // ghost_elem_owner_ranks[i] tells us which rank owns the i-th ghost element @@ -2320,8 +2213,8 @@ void partition_mesh( // Fill elem_recvcounts based on the graph communicator's source order int total_recv = 0; - for (int i = 0; i < indegree_out; i++) { - int source_rank = sources_out[i]; + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int source_rank = element_communication_plan.recv_rank_ids.host(i); elem_recvcounts[i] = static_cast(elems_to_recv_by_rank[source_rank].size()); elem_rdispls[i] = total_recv; total_recv += elem_recvcounts[i]; @@ -2333,8 +2226,8 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); if (rank == r) { std::cout << "[rank " << rank << "] Recv counts: "; - for (int i = 0; i < indegree_out; i++) { - std::cout << "from_rank_" << sources_out[i] << "=" << elem_recvcounts[i] << " "; + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + std::cout << "from_rank_" << element_communication_plan.recv_rank_ids.host(i) << "=" << elem_recvcounts[i] << " "; } std::cout << "(total=" << total_recv << ", expected_ghosts=" << 
final_mesh.num_ghost_elems << ")" << std::endl; } @@ -2346,8 +2239,8 @@ void partition_mesh( std::vector elem_send_buffer(total_send); int send_idx = 0; - for (int i = 0; i < outdegree_out; i++) { - int dest_rank = destinations_out[i]; + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); const auto& elems_for_this_rank = elems_to_send_by_rank[dest_rank]; for (int elem_lid : elems_for_this_rank) { @@ -2373,7 +2266,7 @@ void partition_mesh( elem_recvcounts.data(), // Number of elements to receive from each incoming neighbor [indegree] elem_rdispls.data(), // Displacement in recv buffer for each incoming neighbor [indegree] MPI_DOUBLE, // Receive data type - graph_comm // Distributed graph communicator + element_communication_plan.mpi_comm_graph // Distributed graph communicator ); // ========== Update ghost element fields from receive buffer ========== @@ -2383,8 +2276,8 @@ void partition_mesh( std::vector ghost_updated(final_mesh.num_ghost_elems, false); int recv_idx = 0; - for (int i = 0; i < indegree_out; i++) { - int source_rank = sources_out[i]; + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int source_rank = element_communication_plan.recv_rank_ids.host(i); const auto& ghost_indices = elems_to_recv_by_rank[source_rank]; for (int ghost_idx : ghost_indices) { diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index 35b73985..f4731302 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -1,243 +1,243 @@ -// #ifndef MPIDARRAYKOKKOS_H -// #define MPIDARRAYKOKKOS_H - -// #include "matar.h" -// #include "communication_plan.h" - -// using namespace mtr; - -// ///////////////////////// -// // MPIDArrayKokkos: Dual type for managing distributed data on both CPU and GPU. -// // -// // Enhanced with automatic ghost synchronization via CommunicationPlan. 
-// // Allocates space for owned + ghost items and provides communicate() method. -// // -// // Usage: -// // node.coords.communicate() -> syncs ghost nodes automatically -// // elem.density.communicate() -> syncs ghost elements automatically -// ///////////////////////// -// template -// class MPIDArrayKokkos { - -// // this is manage -// using TArray1D = Kokkos::DualView ; +#ifndef MPIDARRAYKOKKOS_H +#define MPIDARRAYKOKKOS_H + +#include "matar.h" +#include "communication_plan.h" + +using namespace mtr; + +///////////////////////// +// MPIDArrayKokkos: Dual type for managing distributed data on both CPU and GPU. +// +// Enhanced with automatic ghost synchronization via CommunicationPlan. +// Allocates space for owned + ghost items and provides communicate() method. +// +// Usage: +// node.coords.communicate() -> syncs ghost nodes automatically +// elem.density.communicate() -> syncs ghost elements automatically +///////////////////////// +template +class MPIDArrayKokkos { + + // this is manage + using TArray1D = Kokkos::DualView ; -// protected: -// size_t dims_[7]; -// size_t length_; -// size_t order_; // tensor order (rank) -// int mpi_recv_rank_; -// int mpi_tag_; -// MPI_Comm mpi_comm_; -// MPI_Status mpi_status_; -// MPI_Datatype mpi_datatype_; -// MPI_Request mpi_request_; -// TArray1D this_array_; +protected: + size_t dims_[7]; + size_t length_; + size_t order_; // tensor order (rank) + int mpi_recv_rank_; + int mpi_tag_; + MPI_Comm mpi_comm_; + MPI_Status mpi_status_; + MPI_Datatype mpi_datatype_; + MPI_Request mpi_request_; + TArray1D this_array_; -// // --- Ghost Communication Support --- -// CommunicationPlan* comm_plan_; // Pointer to shared communication plan -// size_t num_owned_items_; // Number of owned items (nodes/elements) -// size_t num_total_items_; // Total items including ghosts (owned + ghost) -// size_t num_fields_; // Fields per item (e.g., 3 for 3D coordinates) + // --- Ghost Communication Support --- + CommunicationPlan* comm_plan_; 
// Pointer to shared communication plan + size_t num_owned_items_; // Number of owned items (nodes/elements) + size_t num_total_items_; // Total items including ghosts (owned + ghost) + size_t num_fields_; // Fields per item (e.g., 3 for 3D coordinates) -// void set_mpi_type(); + void set_mpi_type(); -// public: -// // Data member to access host view -// ViewCArray host; +public: + // Data member to access host view + ViewCArray host; -// MPIDArrayKokkos(); + MPIDArrayKokkos(); -// MPIDArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPIDArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); -// MPIDArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPIDArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); -// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); -// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, -// size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); -// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, -// size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); -// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, -// size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); -// MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, -// size_t dim3, size_t dim4, 
size_t dim5, -// size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, + size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); -// // ======================================================================== -// // DISTRIBUTED COMMUNICATION METHODS (NEW) -// // ======================================================================== + // ======================================================================== + // DISTRIBUTED COMMUNICATION METHODS (NEW) + // ======================================================================== -// /** -// * @brief Set communication plan and ghost metadata -// * -// * Call this ONCE after allocating the array to enable ghost communication. -// * Multiple fields can share the same CommunicationPlan pointer. -// * -// * @param plan Pointer to shared CommunicationPlan (node or element plan) -// * @param num_owned Number of owned items on this rank -// * @param num_total Total items including ghosts (owned + ghost) -// * -// * Example: -// * node.coords = MPIDArrayKokkos(num_total_nodes, 3); -// * node.coords.set_communication_plan(&node_comm_plan, num_owned_nodes, num_total_nodes); -// */ -// void set_communication_plan(CommunicationPlan* plan, size_t num_owned, size_t num_total); + /** + * @brief Set communication plan and ghost metadata + * + * Call this ONCE after allocating the array to enable ghost communication. + * Multiple fields can share the same CommunicationPlan pointer. 
+ * + * @param plan Pointer to shared CommunicationPlan (node or element plan) + * @param num_owned Number of owned items on this rank + * @param num_total Total items including ghosts (owned + ghost) + * + * Example: + * node.coords = MPIDArrayKokkos(num_total_nodes, 3); + * node.coords.set_communication_plan(&node_comm_plan, num_owned_nodes, num_total_nodes); + */ + void set_communication_plan(CommunicationPlan* plan, size_t num_owned, size_t num_total); -// /** -// * @brief Synchronize ghost data using neighborhood collectives -// * -// * Automatically exchanges boundary → ghost data for this field. -// * Uses the CommunicationPlan provided via set_communication_plan(). -// * -// * Workflow: -// * 1. Updates host data from device (if needed) -// * 2. Packs owned boundary items -// * 3. Calls MPI_Neighbor_alltoallv (via comm_plan) -// * 4. Unpacks into ghost items -// * 5. Updates device with new ghost data -// * -// * Example usage: -// * // Update owned nodes -// * for (int i = 0; i < num_owned_nodes; i++) { -// * node.coords(i, 0) += dt * velocity(i, 0); -// * } -// * -// * // Sync ghosts -// * node.coords.communicate(); -// * -// * // Now ghost data is current -// */ -// void communicate(); + /** + * @brief Synchronize ghost data using neighborhood collectives + * + * Automatically exchanges boundary → ghost data for this field. + * Uses the CommunicationPlan provided via set_communication_plan(). + * + * Workflow: + * 1. Updates host data from device (if needed) + * 2. Packs owned boundary items + * 3. Calls MPI_Neighbor_alltoallv (via comm_plan) + * 4. Unpacks into ghost items + * 5. 
Updates device with new ghost data + * + * Example usage: + * // Update owned nodes + * for (int i = 0; i < num_owned_nodes; i++) { + * node.coords(i, 0) += dt * velocity(i, 0); + * } + * + * // Sync ghosts + * node.coords.communicate(); + * + * // Now ghost data is current + */ + void communicate(); -// /** -// * @brief Non-blocking version: start ghost exchange -// * -// * For advanced users who want to overlap computation with communication. -// * Must call communicate_wait() before accessing ghost data. -// */ -// void communicate_begin(); + /** + * @brief Non-blocking version: start ghost exchange + * + * For advanced users who want to overlap computation with communication. + * Must call communicate_wait() before accessing ghost data. + */ + void communicate_begin(); -// /** -// * @brief Wait for non-blocking ghost exchange to complete -// */ -// void communicate_wait(); + /** + * @brief Wait for non-blocking ghost exchange to complete + */ + void communicate_wait(); -// /** -// * @brief Get number of owned items (excludes ghosts) -// */ -// KOKKOS_INLINE_FUNCTION -// size_t num_owned() const { return num_owned_items_; } + /** + * @brief Get number of owned items (excludes ghosts) + */ + KOKKOS_INLINE_FUNCTION + size_t num_owned() const { return num_owned_items_; } -// /** -// * @brief Get total items including ghosts -// */ -// KOKKOS_INLINE_FUNCTION -// size_t num_total() const { return num_total_items_; } + /** + * @brief Get total items including ghosts + */ + KOKKOS_INLINE_FUNCTION + size_t num_total() const { return num_total_items_; } -// /** -// * @brief Check if ghost communication is configured -// */ -// bool has_communication_plan() const { return comm_plan_ != nullptr; } + /** + * @brief Check if ghost communication is configured + */ + bool has_communication_plan() const { return comm_plan_ != nullptr; } -// // These functions can setup the data needed for halo send/receives -// // Not necessary for standard MPI comms -// void mpi_setup(); + // 
These functions can setup the data needed for halo send/receives + // Not necessary for standard MPI comms + void mpi_setup(); -// void mpi_setup(int recv_rank); + void mpi_setup(int recv_rank); -// void mpi_setup(int recv_rank, int tag); + void mpi_setup(int recv_rank, int tag); -// void mpi_setup(int recv_rank, int tag, MPI_Comm comm); + void mpi_setup(int recv_rank, int tag, MPI_Comm comm); -// void mpi_set_rank(int recv_rank); + void mpi_set_rank(int recv_rank); -// void mpi_set_tag(int tag); + void mpi_set_tag(int tag); -// void mpi_set_comm(MPI_Comm comm); + void mpi_set_comm(MPI_Comm comm); -// int get_rank(); + int get_rank(); -// int get_tag(); + int get_tag(); -// MPI_Comm get_comm(); + MPI_Comm get_comm(); -// KOKKOS_INLINE_FUNCTION -// T& operator()(size_t i) const; + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i) const; -// KOKKOS_INLINE_FUNCTION -// T& operator()(size_t i, size_t j) const; + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j) const; -// KOKKOS_INLINE_FUNCTION -// T& operator()(size_t i, size_t j, size_t k) const; + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k) const; -// KOKKOS_INLINE_FUNCTION -// T& operator()(size_t i, size_t j, size_t k, size_t l) const; + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l) const; -// KOKKOS_INLINE_FUNCTION -// T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; -// KOKKOS_INLINE_FUNCTION -// T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, -// size_t n) const; + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n) const; -// KOKKOS_INLINE_FUNCTION -// T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, -// size_t n, size_t o) const; + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n, size_t o) const; 
-// KOKKOS_INLINE_FUNCTION -// MPIDArrayKokkos& operator=(const MPIDArrayKokkos& temp); + KOKKOS_INLINE_FUNCTION + MPIDArrayKokkos& operator=(const MPIDArrayKokkos& temp); -// // GPU Method -// // Method that returns size -// KOKKOS_INLINE_FUNCTION -// size_t size() const; + // GPU Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t size() const; -// // Host Method -// // Method that returns size -// KOKKOS_INLINE_FUNCTION -// size_t extent() const; + // Host Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t extent() const; -// KOKKOS_INLINE_FUNCTION -// size_t dims(size_t i) const; + KOKKOS_INLINE_FUNCTION + size_t dims(size_t i) const; -// KOKKOS_INLINE_FUNCTION -// size_t order() const; + KOKKOS_INLINE_FUNCTION + size_t order() const; -// // Method returns the raw device pointer of the Kokkos DualView -// KOKKOS_INLINE_FUNCTION -// T* device_pointer() const; + // Method returns the raw device pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* device_pointer() const; -// // Method returns the raw host pointer of the Kokkos DualView -// KOKKOS_INLINE_FUNCTION -// T* host_pointer() const; + // Method returns the raw host pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* host_pointer() const; -// // Method returns kokkos dual view -// KOKKOS_INLINE_FUNCTION -// TArray1D get_kokkos_dual_view() const; + // Method returns kokkos dual view + KOKKOS_INLINE_FUNCTION + TArray1D get_kokkos_dual_view() const; -// // Method that update host view -// void update_host(); + // Method that update host view + void update_host(); -// // Method that update device view -// void update_device(); + // Method that update device view + void update_device(); -// // Deconstructor -// virtual KOKKOS_INLINE_FUNCTION -// ~MPIDArrayKokkos (); -// }; // End of MPIDArrayKokkos + // Deconstructor + virtual KOKKOS_INLINE_FUNCTION + ~MPIDArrayKokkos (); +}; // End of MPIDArrayKokkos // // 
============================================================================ From 276dba78da2f8586543341357c22bc2b0f6693df Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Wed, 5 Nov 2025 15:13:23 -0600 Subject: [PATCH 19/52] ENH: Fleshing out MPI type and communication plan --- examples/mesh_decomp/communication_plan.h | 86 ++++ examples/mesh_decomp/decomp_utils.h | 201 ++++----- examples/mesh_decomp/mesh_decomp.cpp | 2 +- examples/mesh_decomp/mpi_type.h | 486 +++++++++++----------- 4 files changed, 428 insertions(+), 347 deletions(-) diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h index 1c95a40a..b49befb8 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -51,6 +51,25 @@ // This is a good optimization for large meshes, but will require maps from MPI_comm_world rank IDs to the new reordered rank IDs. int reorder = 0; + + + + + DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_per_rank] Indices of items to send to each rank + DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_per_rank] Indices of items to receive from each rank + + DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank + DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank + + + DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank + DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank + + int total_send_count; + int total_recv_count; + + + // ======================================================================== // CONSTRUCTOR / INITIALIZATION @@ -213,4 +232,71 @@ } + void setup_send_recv(DRaggedRightArrayKokkos &rank_send_ids, DRaggedRightArrayKokkos &rank_recv_ids){ + + this->send_indices_ = rank_send_ids; + this->recv_indices_ = rank_recv_ids; + + + // 
Setup send data + this->send_counts_ = DCArrayKokkos(num_send_ranks, "send_counts"); + this->total_send_count = 0; + for(int i = 0; i < num_send_ranks; i++){ + this->send_counts_.host(i) = rank_send_ids.stride_host(i); + this->total_send_count += this->send_counts_.host(i); + } + this->send_counts_.update_device(); + + this->send_displs_ = DCArrayKokkos(num_send_ranks, "send_displs"); + for(int i = 0; i < num_send_ranks; i++){ + this->send_displs_.host(i) = 0; + for(int j = 0; j < i; j++){ + this->send_displs_.host(i) += this->send_counts_.host(j); + } + } + this->send_displs_.update_device(); + + // Setup recv data + this->recv_counts_ = DCArrayKokkos(num_recv_ranks, "recv_counts"); + this->total_recv_count = 0; + for(int i = 0; i < num_recv_ranks; i++){ + this->recv_counts_.host(i) = rank_recv_ids.stride_host(i); + this->total_recv_count += this->recv_counts_.host(i); + } + this->recv_counts_.update_device(); + + this->recv_displs_ = DCArrayKokkos(num_recv_ranks, "recv_displs"); + for(int i = 0; i < num_recv_ranks; i++){ + this->recv_displs_.host(i) = 0; + for(int j = 0; j < i; j++){ + this->recv_displs_.host(i) += this->recv_counts_.host(j); + } + } + this->recv_displs_.update_device(); + + + // Print the send and recv data sequentially per MPI rank for clarity + MPI_Barrier(mpi_comm_world); + int rank, nprocs; + MPI_Comm_rank(mpi_comm_world, &rank); + MPI_Comm_size(mpi_comm_world, &nprocs); + for(int r = 0; r < nprocs; r++) { + MPI_Barrier(mpi_comm_world); + if(rank == r) { + std::cout << "==============================" << std::endl; + std::cout << "CommunicationPlan info for rank " << rank << std::endl; + for(int i = 0; i < num_send_ranks; i++){ + std::cout << " Send count to rank[" << i << "] (dest rank " << this->send_rank_ids.host(i) << "): " << this->send_counts_.host(i) << std::endl; + std::cout << " Send displs to rank[" << i << "]: " << this->send_displs_.host(i) << std::endl; + } + for(int i = 0; i < num_recv_ranks; i++){ + std::cout << " Recv count 
from rank[" << i << "] (source rank " << this->recv_rank_ids.host(i) << "): " << this->recv_counts_.host(i) << std::endl; + std::cout << " Recv displs from rank[" << i << "]: " << this->recv_displs_.host(i) << std::endl; + } + std::cout << "==============================" << std::endl << std::flush; + } + } + MPI_Barrier(mpi_comm_world); + } + }; diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index d4981a17..2a256deb 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -717,9 +717,9 @@ void partition_mesh( - if (print_vtk) { - write_vtk(naive_mesh, naive_node, rank); - } + // if (print_vtk) { + // write_vtk(naive_mesh, naive_node, rank); + // } @@ -2076,7 +2076,7 @@ void partition_mesh( // ****************************************************************************************** -// Create MPI distributed graph communicator for element communication +// Create Communication Plan for element communication // ****************************************************************************************** @@ -2115,48 +2115,22 @@ void partition_mesh( // Initialize the graph communicator for element communication element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), indegree, ghost_elem_receive_ranks_vec.data()); - - // Optional: Verify the graph communicator was created successfully - if (rank == 0) { - std::cout << " Created MPI distributed graph communicator for element communication" << std::endl; - } MPI_Barrier(MPI_COMM_WORLD); - - - - // ============================================================================ - // Verify the distributed graph communicator - // ============================================================================ - element_communication_plan.verify_graph_communicator(); + // Optional: Verify the graph communicator was created successfully + if(print_info) element_communication_plan.verify_graph_communicator(); // 
****************************************************************************************** -// Test element communication using MPI_Neighbor_alltoallv +// Build send counts and displacements for element communication // ****************************************************************************************** - // Gauss points share the same communication plan as elements. - // This test initializes gauss point fields on owned elements and exchanges them with ghost elements. - - print_info = true; // Enable debug output for communication test - - gauss_point.initialize(final_mesh.num_elems, 1, {gauss_pt_state::fields}); - - // Initialize the gauss point fields on each rank - // Set owned elements to rank number, ghost elements to -1 (to verify communication) - for (int i = 0; i < final_mesh.num_owned_elems; i++) { - gauss_point.fields.host(i) = static_cast(rank); - } - for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { - gauss_point.fields.host(i) = -1.0; // Ghost elements should be updated - } - gauss_point.fields.update_device(); - // ========== Build send counts and displacements for OUTGOING neighbors (destinations) ========== + // ========== Build send counts and displacements for OUTGOING neighbors (destinations) ========== // For MPI_Neighbor_alltoallv with graph communicator: // - elem_sendcounts[i] = number of elements to send to i-th outgoing neighbor (destinations_out[i]) // - elem_sdispls[i] = starting position in send buffer for i-th outgoing neighbor - std::vector elem_sendcounts(element_communication_plan.num_send_ranks, 0); - std::vector elem_sdispls(element_communication_plan.num_send_ranks, 0); + // std::vector elem_sendcounts(element_communication_plan.num_send_ranks, 0); + // std::vector elem_sdispls(element_communication_plan.num_send_ranks, 0); // Count how many boundary elements go to each destination rank // boundary_elem_targets[elem_lid] contains pairs (dest_rank, elem_gid) for each boundary element @@ -2170,37 
+2144,24 @@ void partition_mesh( } } } - - // Fill elem_sendcounts based on the graph communicator's destination order - int total_send = 0; + + // Serialize into a DRaggedRightArrayKokkos + CArrayKokkos strides_array(element_communication_plan.num_send_ranks); for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { int dest_rank = element_communication_plan.send_rank_ids.host(i); - elem_sendcounts[i] = static_cast(elems_to_send_by_rank[dest_rank].size()); - elem_sdispls[i] = total_send; - total_send += elem_sendcounts[i]; + strides_array(i) = elems_to_send_by_rank[dest_rank].size(); } - - // Debug: Print send counts - if (print_info) { - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "[rank " << rank << "] Send counts: "; - for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { - std::cout << "to_rank_" << element_communication_plan.send_rank_ids.host(i) << "=" << elem_sendcounts[i] << " "; - } - std::cout << "(total=" << total_send << ")" << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); + DRaggedRightArrayKokkos elems_to_send_by_rank_rr(strides_array, "elems_to_send_by_rank"); + + // Fill in the data + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); + for (int j = 0; j < elems_to_send_by_rank[dest_rank].size(); j++) { + elems_to_send_by_rank_rr.host(i, j) = elems_to_send_by_rank[dest_rank][j]; } } - - // ========== Build receive counts and displacements for INCOMING neighbors (sources) ========== - // - elem_recvcounts[i] = number of elements to receive from i-th incoming neighbor (sources_out[i]) - // - elem_rdispls[i] = starting position in recv buffer for i-th incoming neighbor - - std::vector elem_recvcounts(element_communication_plan.num_recv_ranks, 0); - std::vector elem_rdispls(element_communication_plan.num_recv_ranks, 0); + elems_to_send_by_rank_rr.update_device(); + // Count how 
many ghost elements come from each source rank // ghost_elem_owner_ranks[i] tells us which rank owns the i-th ghost element @@ -2210,61 +2171,102 @@ void partition_mesh( int source_rank = ghost_elem_owner_ranks[i]; elems_to_recv_by_rank[source_rank].push_back(static_cast(i)); } - - // Fill elem_recvcounts based on the graph communicator's source order - int total_recv = 0; + + // ========== Serialize into a DRaggedRightArrayKokkos ========== + CArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks); for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { int source_rank = element_communication_plan.recv_rank_ids.host(i); - elem_recvcounts[i] = static_cast(elems_to_recv_by_rank[source_rank].size()); - elem_rdispls[i] = total_recv; - total_recv += elem_recvcounts[i]; + elem_recv_strides_array(i) = elems_to_recv_by_rank[source_rank].size(); } - - // Debug: Print receive counts - if (print_info) { - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "[rank " << rank << "] Recv counts: "; - for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { - std::cout << "from_rank_" << element_communication_plan.recv_rank_ids.host(i) << "=" << elem_recvcounts[i] << " "; - } - std::cout << "(total=" << total_recv << ", expected_ghosts=" << final_mesh.num_ghost_elems << ")" << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); + DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); + // Fill in the data + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int source_rank = element_communication_plan.recv_rank_ids.host(i); + for (int j = 0; j < elems_to_recv_by_rank[source_rank].size(); j++) { + elems_to_recv_by_rank_rr.host(i, j) = elems_to_recv_by_rank[source_rank][j]; } } + elems_to_recv_by_rank_rr.update_device(); + + element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); + 
+ MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Finished building the send and recv counts and displacements for element communication" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + +// ****************************************************************************************** +// Test element communication using MPI_Neighbor_alltoallv +// ****************************************************************************************** + // Gauss points share the same communication plan as elements. + // This test initializes gauss point fields on owned elements and exchanges them with ghost elements. + + print_info = true; // Enable debug output for communication test + + gauss_point.initialize(final_mesh.num_elems, 1, {gauss_pt_state::fields}); // , &element_communication_plan + + // Initialize the gauss point fields on each rank + // Set owned elements to rank number, ghost elements to -1 (to verify communication) + for (int i = 0; i < final_mesh.num_owned_elems; i++) { + gauss_point.fields.host(i) = static_cast(rank); + } + for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { + gauss_point.fields.host(i) = -1.0; // Ghost elements should be updated + } + gauss_point.fields.update_device(); + + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build the send buffer for element communication" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); // ========== Build send buffer organized by destination rank ========== - std::vector elem_send_buffer(total_send); + std::vector elem_send_buffer(element_communication_plan.total_send_count); int send_idx = 0; for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { - int dest_rank = element_communication_plan.send_rank_ids.host(i); - const auto& elems_for_this_rank = elems_to_send_by_rank[dest_rank]; + // Get the number of elements to send to this neighbor + size_t num_elems_to_send = elems_to_send_by_rank_rr.stride_host(i); + if(rank == 0) std::cout << " Sending 
" << num_elems_to_send << " elements to rank " << element_communication_plan.send_rank_ids.host(i) << std::endl; - for (int elem_lid : elems_for_this_rank) { - elem_send_buffer[send_idx++] = gauss_point.fields.host(elem_lid); + for (size_t j = 0; j < num_elems_to_send; j++) { + int elem_lid = elems_to_send_by_rank_rr.host(i, j); + if(rank == 0) std::cout << " Sending element " << elem_lid << std::endl; + double value = gauss_point.fields.host(elem_lid); + + if(rank == 0) std::cout << " Value: " << value << std::endl; + elem_send_buffer[send_idx++] = value; } } + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Finished building the send buffer for element communication" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); // ========== Allocate receive buffer ========== - std::vector elem_recv_buffer(total_recv); + std::vector elem_recv_buffer(element_communication_plan.total_recv_count); + + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Finished building the receive buffer for element communication" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to exchange element data using MPI_Neighbor_alltoallv" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); - // ========== Exchange data using MPI_Neighbor_alltoallv ========== - // MPI_Neighbor_alltoallv exchanges data with neighbors in the graph communicator topology - // - elem_sendcounts[i]: number of doubles to send to i-th outgoing neighbor - // - elem_recvcounts[i]: number of doubles to receive from i-th incoming neighbor - // - The order of neighbors must match the order returned by MPI_Dist_graph_neighbors + MPI_Neighbor_alltoallv( elem_send_buffer.data(), // Send buffer with boundary element data - elem_sendcounts.data(), // Number of elements to send to each outgoing neighbor [outdegree] - elem_sdispls.data(), // Displacement in send buffer for each outgoing neighbor [outdegree] + 
element_communication_plan.send_counts_.host_pointer(), // Number of elements to send to each outgoing neighbor [outdegree] + element_communication_plan.send_displs_.host_pointer(), // Displacement in send buffer for each outgoing neighbor [outdegree] MPI_DOUBLE, // Send data type elem_recv_buffer.data(), // Receive buffer for ghost element data - elem_recvcounts.data(), // Number of elements to receive from each incoming neighbor [indegree] - elem_rdispls.data(), // Displacement in recv buffer for each incoming neighbor [indegree] + element_communication_plan.recv_counts_.host_pointer(), // Number of elements to receive from each incoming neighbor [indegree] + element_communication_plan.recv_displs_.host_pointer(), // Displacement in recv buffer for each incoming neighbor [indegree] MPI_DOUBLE, // Receive data type element_communication_plan.mpi_comm_graph // Distributed graph communicator ); @@ -2277,10 +2279,11 @@ void partition_mesh( int recv_idx = 0; for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { - int source_rank = element_communication_plan.recv_rank_ids.host(i); - const auto& ghost_indices = elems_to_recv_by_rank[source_rank]; + // Get the number of ghost elements from this source rank + size_t num_ghosts_from_source = elems_to_recv_by_rank_rr.stride_host(i); - for (int ghost_idx : ghost_indices) { + for (size_t j = 0; j < num_ghosts_from_source; j++) { + int ghost_idx = elems_to_recv_by_rank_rr.host(i, j); int ghost_elem_local_id = final_mesh.num_owned_elems + ghost_idx; gauss_point.fields.host(ghost_elem_local_id) = elem_recv_buffer[recv_idx++]; ghost_updated[ghost_idx] = true; diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index b14ee9cd..e1383ccb 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int 
num_elems_dim[3] = {50, 50, 50}; + int num_elems_dim[3] = {20, 20, 20}; // Initial mesh built on rank zero Mesh_t initial_mesh; diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index f4731302..636a1bd0 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -1,5 +1,5 @@ -#ifndef MPIDARRAYKOKKOS_H -#define MPIDARRAYKOKKOS_H +#ifndef MPICARRAYKOKKOS_H +#define MPICARRAYKOKKOS_H #include "matar.h" #include "communication_plan.h" @@ -7,38 +7,42 @@ using namespace mtr; ///////////////////////// -// MPIDArrayKokkos: Dual type for managing distributed data on both CPU and GPU. +// MPICArrayKokkos: Dual type for managing distributed data on both CPU and GPU. // -// Enhanced with automatic ghost synchronization via CommunicationPlan. -// Allocates space for owned + ghost items and provides communicate() method. -// -// Usage: -// node.coords.communicate() -> syncs ghost nodes automatically -// elem.density.communicate() -> syncs ghost elements automatically ///////////////////////// template -class MPIDArrayKokkos { +class MPICArrayKokkos { - // this is manage - using TArray1D = Kokkos::DualView ; + // Dual view for managing data on both CPU and GPU + DCArrayKokkos this_array_; protected: size_t dims_[7]; size_t length_; size_t order_; // tensor order (rank) - int mpi_recv_rank_; - int mpi_tag_; + MPI_Comm mpi_comm_; MPI_Status mpi_status_; MPI_Datatype mpi_datatype_; MPI_Request mpi_request_; - TArray1D this_array_; + // --- Ghost Communication Support --- CommunicationPlan* comm_plan_; // Pointer to shared communication plan - size_t num_owned_items_; // Number of owned items (nodes/elements) - size_t num_total_items_; // Total items including ghosts (owned + ghost) - size_t num_fields_; // Fields per item (e.g., 3 for 3D coordinates) + + + DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank + DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive 
from each rank + DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank + DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank + + + DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_by_rank] Indices of items to send to each rank + DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_by_rank] Indices of items to receive from each rank + + + size_t num_owned_; // Number of owned items (nodes/elements) + size_t num_ghost_; // Number of ghost items (nodes/elements) void set_mpi_type(); @@ -46,131 +50,28 @@ class MPIDArrayKokkos { // Data member to access host view ViewCArray host; - MPIDArrayKokkos(); + MPICArrayKokkos(); - MPIDArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPICArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIDArrayKokkos(size_t dim0, size_t dim1, size_t 
dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); - - // ======================================================================== - // DISTRIBUTED COMMUNICATION METHODS (NEW) - // ======================================================================== - - /** - * @brief Set communication plan and ghost metadata - * - * Call this ONCE after allocating the array to enable ghost communication. - * Multiple fields can share the same CommunicationPlan pointer. - * - * @param plan Pointer to shared CommunicationPlan (node or element plan) - * @param num_owned Number of owned items on this rank - * @param num_total Total items including ghosts (owned + ghost) - * - * Example: - * node.coords = MPIDArrayKokkos(num_total_nodes, 3); - * node.coords.set_communication_plan(&node_comm_plan, num_owned_nodes, num_total_nodes); - */ - void set_communication_plan(CommunicationPlan* plan, size_t num_owned, size_t num_total); - - - /** - * @brief Synchronize ghost data using neighborhood collectives - * - * Automatically exchanges boundary → ghost data for this field. - * Uses the CommunicationPlan provided via set_communication_plan(). - * - * Workflow: - * 1. Updates host data from device (if needed) - * 2. Packs owned boundary items - * 3. Calls MPI_Neighbor_alltoallv (via comm_plan) - * 4. Unpacks into ghost items - * 5. Updates device with new ghost data - * - * Example usage: - * // Update owned nodes - * for (int i = 0; i < num_owned_nodes; i++) { - * node.coords(i, 0) += dt * velocity(i, 0); - * } - * - * // Sync ghosts - * node.coords.communicate(); - * - * // Now ghost data is current - */ - void communicate(); - - - /** - * @brief Non-blocking version: start ghost exchange - * - * For advanced users who want to overlap computation with communication. - * Must call communicate_wait() before accessing ghost data. 
- */ - void communicate_begin(); - - - /** - * @brief Wait for non-blocking ghost exchange to complete - */ - void communicate_wait(); - - - /** - * @brief Get number of owned items (excludes ghosts) - */ - KOKKOS_INLINE_FUNCTION - size_t num_owned() const { return num_owned_items_; } - - - /** - * @brief Get total items including ghosts - */ - KOKKOS_INLINE_FUNCTION - size_t num_total() const { return num_total_items_; } - - - /** - * @brief Check if ghost communication is configured - */ - bool has_communication_plan() const { return comm_plan_ != nullptr; } - - // These functions can setup the data needed for halo send/receives - // Not necessary for standard MPI comms - void mpi_setup(); - - void mpi_setup(int recv_rank); - - void mpi_setup(int recv_rank, int tag); - - void mpi_setup(int recv_rank, int tag, MPI_Comm comm); - - void mpi_set_rank(int recv_rank); - - void mpi_set_tag(int tag); - void mpi_set_comm(MPI_Comm comm); - - int get_rank(); - - int get_tag(); - - MPI_Comm get_comm(); KOKKOS_INLINE_FUNCTION T& operator()(size_t i) const; @@ -198,6 +99,13 @@ class MPIDArrayKokkos { KOKKOS_INLINE_FUNCTION MPIDArrayKokkos& operator=(const MPIDArrayKokkos& temp); + + // Method to set comm plan + KOKKOS_INLINE_FUNCTION + void initialize_comm_plan(CommunicationPlan* comm_plan); + + + // GPU Method // Method that returns size KOKKOS_INLINE_FUNCTION @@ -240,121 +148,205 @@ class MPIDArrayKokkos { }; // End of MPIDArrayKokkos -// // ============================================================================ -// // INLINE IMPLEMENTATIONS - DISTRIBUTED COMMUNICATION -// // ============================================================================ - -// /** -// * @brief Default constructor - initialize ghost communication members -// */ -// template -// KOKKOS_INLINE_FUNCTION -// MPIDArrayKokkos::MPIDArrayKokkos() -// : comm_plan_(nullptr), -// num_owned_items_(0), -// num_total_items_(0), -// num_fields_(0) -// { -// // Base constructor handles array initialization 
-// } - - -// /** -// * @brief Set communication plan and ghost metadata -// */ -// template -// inline void MPIDArrayKokkos::set_communication_plan( -// CommunicationPlan* plan, -// size_t num_owned, -// size_t num_total) -// { -// comm_plan_ = plan; -// num_owned_items_ = num_owned; -// num_total_items_ = num_total; - -// // Infer number of fields from array dimensions -// // Assumption: dim0 = num_items, dim1+ = fields -// if (order_ == 1) { -// num_fields_ = 1; // Scalar field -// } else if (order_ == 2) { -// num_fields_ = dims_[1]; // Vector field (e.g., coords[num_nodes, 3]) -// } else { -// // For higher order tensors, treat everything after dim0 as fields -// num_fields_ = 1; -// for (size_t i = 1; i < order_; i++) { -// num_fields_ *= dims_[i]; -// } -// } - -// // Validate dimensions match total items -// if (dims_[0] != num_total) { -// std::cerr << "Error: Array dim0 (" << dims_[0] << ") does not match num_total (" -// << num_total << ")" << std::endl; -// std::cerr << " Array must be allocated with size = num_owned + num_ghost" << std::endl; -// } -// } - - -// /** -// * @brief Synchronize ghost data using neighborhood collectives -// */ -// template -// inline void MPIDArrayKokkos::communicate() -// { -// if (!comm_plan_) { -// std::cerr << "Error: CommunicationPlan not set. Call set_communication_plan() first." << std::endl; -// return; -// } - -// if (!comm_plan_->has_graph_comm) { -// std::cerr << "Error: Graph communicator not initialized in CommunicationPlan." << std::endl; -// std::cerr << " Call comm_plan.create_graph_communicator() first." << std::endl; -// return; -// } - -// // 1. Update host from device (ensure data is current on CPU for MPI) -// this->update_host(); - -// // 2. Get raw pointer to data -// T* data_ptr = this->host_pointer(); - -// // 3. Convert to double* for MPI communication -// // TODO: Support other types (int, float, etc.) 
with template specialization -// static_assert(std::is_same::value, -// "Currently only double supported for ghost communication"); - -// double* double_ptr = reinterpret_cast(data_ptr); - -// // 4. Call neighborhood collective exchange -// comm_plan_->exchange_ghosts_neighborhood(double_ptr, static_cast(num_fields_)); - -// // 5. Update device with new ghost data -// this->update_device(); -// } - - -// /** -// * @brief Non-blocking version: start ghost exchange -// */ -// template -// inline void MPIDArrayKokkos::communicate_begin() -// { -// // TODO: Implement non-blocking version using Isend/Irecv -// // For now, just call blocking version -// std::cerr << "Warning: communicate_begin() not yet implemented, using blocking communicate()" << std::endl; -// communicate(); -// } - - -// /** -// * @brief Wait for non-blocking ghost exchange to complete -// */ -// template -// inline void MPIDArrayKokkos::communicate_wait() -// { -// // TODO: Implement non-blocking version -// // For now, this is a no-op since communicate_begin() is blocking -// } - - -// #endif // MPIDARRAYKOKKOS_H + +// Default constructor +template +MPICArrayKokkos::MPICArrayKokkos() + : this_array_() { } + +// Overloaded 1D constructor +template +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) { + this_array_ = DCArrayKokkos(dim0, tag_string); + host = ViewCArray (this_array_.view_host().data(), dim0); +} + +// Overloaded 2D constructor +template +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) { + this_array_ = DCArrayKokkos(dim0, dim1, tag_string); + host = ViewCArray (this_array_.view_host().data(), dim0, dim1); +} + +// Overloaded 3D constructor +template +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) { + this_array_ = DCArrayKokkos(dim0, dim1, dim2, tag_string); + host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2); +} + +// Overloaded 4D constructor 
+template +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) { + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, tag_string); + host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3); +} + +// Overloaded 5D constructor +template +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) { + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, tag_string); + host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3, dim4); +} + +// Overloaded 6D constructor +template +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) { + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, tag_string); + host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3, dim4, dim5); +} + +// Overloaded 7D constructor +template +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) { + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, dim6, tag_string); + host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); +} + + +template +KOKKOS_INLINE_FUNCTION +T& MPICArrayKokkos::operator()(size_t i) const { + assert(order_ == 1 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 1D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 1D!"); + return this_array_(i); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPICArrayKokkos::operator()(size_t i, size_t j) const { + assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 2D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 2D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 2D!"); + return this_array_(i, 
j); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k) const { + assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 3D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 3D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 3D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 3D!"); + return this_array_(i, j, k); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { + assert(order_ == 4 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 4D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 4D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 4D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 4D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 4D!"); + return this_array_(i, j, k, l); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const { + assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 5D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 5D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 5D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 5D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 5D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 5D!"); + return this_array_(i, j, k, l, m); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) const { + assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 6D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 6D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 6D!"); + assert(k 
< dims_[2] && "k is out of bounds in MPICArrayKokkos 6D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 6D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 6D!"); + assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 6D!"); + return this_array_(i, j, k, l, m, n); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n, size_t o) const { + assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 7D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 7D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 7D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 7D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 7D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 7D!"); + assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 7D!"); + assert(o < dims_[6] && "o is out of bounds in MPICArrayKokkos 7D!"); + return this_array_(i, j, k, l, m, n, o); +} + +template +KOKKOS_INLINE_FUNCTION +MPIDArrayKokkos& MPICArrayKokkos::operator=(const MPIDArrayKokkos& temp) { + this_array_ = temp.this_array_; + return *this; +} + +// Return size +template +KOKKOS_INLINE_FUNCTION +size_t MPICArrayKokkos::size() const { + return this_array_.size(); +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPICArrayKokkos::extent() const { + return this_array_.extent(); +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPICArrayKokkos::dims(size_t i) const { + assert(i < order_ && "MPICArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); + assert(dims_[i]>0 && "Access to MPICArrayKokkos dims is out of bounds!"); + return this_array_.dims(i); +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPICArrayKokkos::order() const { + return this_array_.order(); +} + +template +KOKKOS_INLINE_FUNCTION +T* MPICArrayKokkos::device_pointer() const { + 
return this_array_.device_pointer(); +} + +template +KOKKOS_INLINE_FUNCTION +T* MPICArrayKokkos::host_pointer() const { + return this_array_.host_pointer(); +} + +template +KOKKOS_INLINE_FUNCTION +Kokkos::DualView MPICArrayKokkos::get_kokkos_dual_view() const { + return this_array_.get_kokkos_dual_view(); +} + +template +void MPICArrayKokkos::update_host() { + this_array_.update_host(); +} + +template +void MPICArrayKokkos::update_device() { + this_array_.update_device(); +} + +template +KOKKOS_INLINE_FUNCTION +MPICArrayKokkos::~MPICArrayKokkos() { + this_array_.~DCArrayKokkos(); +} + +#endif \ No newline at end of file From c4fac4d2bf3398ef6692250cd798a21a26a6e178 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Wed, 5 Nov 2025 17:46:09 -0600 Subject: [PATCH 20/52] ENH: Handling comms via data structure --- examples/mesh_decomp/communication_plan.h | 19 +- examples/mesh_decomp/decomp_utils.h | 364 ++++++++-------------- examples/mesh_decomp/mesh_decomp.cpp | 2 +- examples/mesh_decomp/mpi_type.h | 201 +++++++++++- examples/mesh_decomp/state.h | 14 +- 5 files changed, 340 insertions(+), 260 deletions(-) diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h index b49befb8..16904e57 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -1,3 +1,10 @@ +#ifndef COMMUNICATION_PLAN_H +#define COMMUNICATION_PLAN_H + +#include "matar.h" + +using namespace mtr; + /** * @struct CommunicationPlan * @brief Manages efficient MPI communication for ghost element and node data exchange @@ -69,7 +76,7 @@ int total_recv_count; - + // ======================================================================== // CONSTRUCTOR / INITIALIZATION @@ -103,7 +110,7 @@ this->num_send_ranks = num_send_ranks; this->num_recv_ranks = num_recv_ranks; - + this->send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); for(int i = 0; i < num_send_ranks; i++){ this->send_rank_ids(i) = send_rank_ids[i]; 
@@ -114,7 +121,7 @@ for(int i = 0; i < num_recv_ranks; i++){ this->recv_rank_ids(i) = recv_rank_ids[i]; } - + MPI_Dist_graph_create_adjacent( mpi_comm_world, num_recv_ranks, @@ -234,8 +241,8 @@ void setup_send_recv(DRaggedRightArrayKokkos &rank_send_ids, DRaggedRightArrayKokkos &rank_recv_ids){ - this->send_indices_ = rank_send_ids; - this->recv_indices_ = rank_recv_ids; + this->send_indices_ = rank_send_ids; // ods of element data to send to each rank + this->recv_indices_ = rank_recv_ids; // // Setup send data @@ -300,3 +307,5 @@ } }; + +#endif // COMMUNICATION_PLAN_H \ No newline at end of file diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 2a256deb..49e1113e 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -1893,14 +1893,14 @@ void partition_mesh( final_node.coords.update_device(); -// -------------------------------------------------------------------------------------- + // -------------------------------------------------------------------------------------- // Build the send patterns for elements -// Build reverse map via global IDs: for each local element gid, find ranks that ghost it. -// Steps: -// 1) Each rank contributes its ghost element GIDs. -// 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. -// 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. -// -------------------------------------------------------------------------------------- + // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost element GIDs. + // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. + // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
+ // -------------------------------------------------------------------------------------- std::vector>> boundary_elem_targets(intermediate_mesh.num_elems); // Prepare local ghost list as vector @@ -1960,7 +1960,7 @@ void partition_mesh( std::cout.flush(); MPI_Barrier(MPI_COMM_WORLD); - + // Optional: print a compact summary of reverse map for verification (limited output) for(int i = 0; i < world_size; i++) { @@ -2169,7 +2169,8 @@ void partition_mesh( for (size_t i = 0; i < ghost_elem_owner_ranks.size(); i++) { int source_rank = ghost_elem_owner_ranks[i]; - elems_to_recv_by_rank[source_rank].push_back(static_cast(i)); + int ghost_elem_local_id = final_mesh.num_owned_elems + i; + elems_to_recv_by_rank[source_rank].push_back(ghost_elem_local_id); } // ========== Serialize into a DRaggedRightArrayKokkos ========== @@ -2177,6 +2178,7 @@ void partition_mesh( for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { int source_rank = element_communication_plan.recv_rank_ids.host(i); elem_recv_strides_array(i) = elems_to_recv_by_rank[source_rank].size(); + } DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); // Fill in the data @@ -2188,6 +2190,21 @@ void partition_mesh( } elems_to_recv_by_rank_rr.update_device(); + // Debug: Print send vs recv counts per neighbor to diagnose mismatch + if (print_info) { + std::cout << "[rank " << rank << "] Send/Recv count comparison:" << std::endl; + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); + int send_count = elems_to_send_by_rank_rr.stride_host(i); + std::cout << " To rank " << dest_rank << ": sending " << send_count << " elements" << std::endl; + } + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int src_rank = element_communication_plan.recv_rank_ids.host(i); + int recv_count = elems_to_recv_by_rank_rr.stride_host(i); + std::cout << " From rank " << 
src_rank << ": expecting " << recv_count << " elements" << std::endl; + } + } + element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); MPI_Barrier(MPI_COMM_WORLD); @@ -2200,13 +2217,12 @@ void partition_mesh( // Gauss points share the same communication plan as elements. // This test initializes gauss point fields on owned elements and exchanges them with ghost elements. - print_info = true; // Enable debug output for communication test - - gauss_point.initialize(final_mesh.num_elems, 1, {gauss_pt_state::fields}); // , &element_communication_plan + gauss_point.initialize(final_mesh.num_elems, 1, {gauss_pt_state::fields}, element_communication_plan); // , &element_communication_plan // Initialize the gauss point fields on each rank // Set owned elements to rank number, ghost elements to -1 (to verify communication) for (int i = 0; i < final_mesh.num_owned_elems; i++) { + // if(rank == 0) std::cout << " Setting owned element " << i << " to rank " << rank << std::endl; gauss_point.fields.host(i) = static_cast(rank); } for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { @@ -2214,136 +2230,8 @@ void partition_mesh( } gauss_point.fields.update_device(); - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Starting to build the send buffer for element communication" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - - // ========== Build send buffer organized by destination rank ========== - std::vector elem_send_buffer(element_communication_plan.total_send_count); - int send_idx = 0; - - for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { - // Get the number of elements to send to this neighbor - size_t num_elems_to_send = elems_to_send_by_rank_rr.stride_host(i); - if(rank == 0) std::cout << " Sending " << num_elems_to_send << " elements to rank " << element_communication_plan.send_rank_ids.host(i) << std::endl; - - for (size_t j = 0; j < num_elems_to_send; j++) { - int elem_lid = 
elems_to_send_by_rank_rr.host(i, j); - if(rank == 0) std::cout << " Sending element " << elem_lid << std::endl; - double value = gauss_point.fields.host(elem_lid); - - if(rank == 0) std::cout << " Value: " << value << std::endl; - elem_send_buffer[send_idx++] = value; - } - } - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Finished building the send buffer for element communication" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - - // ========== Allocate receive buffer ========== - std::vector elem_recv_buffer(element_communication_plan.total_recv_count); - - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Finished building the receive buffer for element communication" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Starting to exchange element data using MPI_Neighbor_alltoallv" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - - - - MPI_Neighbor_alltoallv( - elem_send_buffer.data(), // Send buffer with boundary element data - element_communication_plan.send_counts_.host_pointer(), // Number of elements to send to each outgoing neighbor [outdegree] - element_communication_plan.send_displs_.host_pointer(), // Displacement in send buffer for each outgoing neighbor [outdegree] - MPI_DOUBLE, // Send data type - elem_recv_buffer.data(), // Receive buffer for ghost element data - element_communication_plan.recv_counts_.host_pointer(), // Number of elements to receive from each incoming neighbor [indegree] - element_communication_plan.recv_displs_.host_pointer(), // Displacement in recv buffer for each incoming neighbor [indegree] - MPI_DOUBLE, // Receive data type - element_communication_plan.mpi_comm_graph // Distributed graph communicator - ); - - // ========== Update ghost element fields from receive buffer ========== - // Unpack received data back into ghost elements in the correct order - - // Track which ghost elements have been updated for debugging - std::vector 
ghost_updated(final_mesh.num_ghost_elems, false); - - int recv_idx = 0; - for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { - // Get the number of ghost elements from this source rank - size_t num_ghosts_from_source = elems_to_recv_by_rank_rr.stride_host(i); - - for (size_t j = 0; j < num_ghosts_from_source; j++) { - int ghost_idx = elems_to_recv_by_rank_rr.host(i, j); - int ghost_elem_local_id = final_mesh.num_owned_elems + ghost_idx; - gauss_point.fields.host(ghost_elem_local_id) = elem_recv_buffer[recv_idx++]; - ghost_updated[ghost_idx] = true; - } - } + gauss_point.fields.communicate(); - // Debug: Check which ghosts weren't updated - if (print_info) { - std::vector missing_ghosts; - for (size_t i = 0; i < ghost_updated.size(); i++) { - if (!ghost_updated[i]) { - missing_ghosts.push_back(static_cast(i)); - } - } - - if (!missing_ghosts.empty()) { - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "[rank " << rank << "] WARNING: " << missing_ghosts.size() - << " ghost elements not in elems_to_recv_by_rank: "; - for (size_t i = 0; i < std::min(missing_ghosts.size(), size_t(10)); i++) { - std::cout << missing_ghosts[i] << " "; - } - if (missing_ghosts.size() > 10) std::cout << "..."; - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } - } - } - - gauss_point.fields.update_device(); - - // ========== Verify the communication worked correctly ========== - bool comm_test_passed = true; - for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { - if (gauss_point.fields.host(i) < 0.0) { - std::cerr << "[rank " << rank << "] ERROR: Ghost element " << i - << " was not updated (value = " << gauss_point.fields.host(i) << ")" << std::endl; - comm_test_passed = false; - } - } - - int local_test_passed = comm_test_passed ? 
1 : 0; - int global_test_passed = 0; - MPI_Allreduce(&local_test_passed, &global_test_passed, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0) { - if (global_test_passed) { - std::cout << "\n✓ Element communication test PASSED on all ranks\n" << std::endl; - } else { - std::cout << "\n✗ Element communication test FAILED on one or more ranks\n" << std::endl; - } - } - MPI_Barrier(MPI_COMM_WORLD); - - print_info = false; // Disable debug output after communication test - // Loop over all elements and average the values of elements connected to that element for (int i = 0; i < final_mesh.num_elems; i++) { double value = 0.0; @@ -2355,112 +2243,112 @@ void partition_mesh( } gauss_point.fields.update_device(); - -// -------------------------------------------------------------------------------------- + + // -------------------------------------------------------------------------------------- // Build the send pattern for nodes -// Build reverse map via global IDs: for each local node gid, find ranks that ghost it. -// Steps: -// 1) Each rank contributes its ghost node GIDs. -// 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. -// 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. 
-// -------------------------------------------------------------------------------------- - - std::vector>> boundary_node_targets(intermediate_mesh.num_nodes); - - // Prepare local ghost node list as vector - std::vector ghost_node_gids_vec; - ghost_node_gids_vec.reserve(ghost_only_nodes.size()); - for (const auto &g : ghost_only_nodes) ghost_node_gids_vec.push_back(g); - - // Exchange counts - std::vector ghost_node_counts(world_size, 0); - int local_ghost_node_count = static_cast(ghost_node_gids_vec.size()); - MPI_Allgather(&local_ghost_node_count, 1, MPI_INT, ghost_node_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - - // Displacements and recv buffer - std::vector ghost_node_displs(world_size, 0); - int total_ghost_nodes = 0; - for (int r = 0; r < world_size; ++r) { - ghost_node_displs[r] = total_ghost_nodes; - total_ghost_nodes += ghost_node_counts[r]; - } - std::vector all_ghost_node_gids(total_ghost_nodes); - - // Gather ghost node gids - MPI_Allgatherv(ghost_node_gids_vec.data(), local_ghost_node_count, MPI_UNSIGNED_LONG_LONG, - all_ghost_node_gids.data(), ghost_node_counts.data(), ghost_node_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Finished gathering ghost node GIDs" << std::endl; - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Starting to build the reverse map for node communication" << std::endl; - - // Build map node_gid -> ranks that ghost it - std::unordered_map> node_gid_to_ghosting_ranks; - node_gid_to_ghosting_ranks.reserve(static_cast(total_ghost_nodes)); - for (int r = 0; r < world_size; ++r) { - int cnt = ghost_node_counts[r]; - int off = ghost_node_displs[r]; - for (int i = 0; i < cnt; ++i) { - size_t g = all_ghost_node_gids[off + i]; - node_gid_to_ghosting_ranks[g].push_back(r); - } - } + // Build reverse map via global IDs: for each local node gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost node GIDs. 
+ // 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. + // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. + // -------------------------------------------------------------------------------------- + + // std::vector>> boundary_node_targets(intermediate_mesh.num_nodes); + + // // Prepare local ghost node list as vector + // std::vector ghost_node_gids_vec; + // ghost_node_gids_vec.reserve(ghost_only_nodes.size()); + // for (const auto &g : ghost_only_nodes) ghost_node_gids_vec.push_back(g); + + // // Exchange counts + // std::vector ghost_node_counts(world_size, 0); + // int local_ghost_node_count = static_cast(ghost_node_gids_vec.size()); + // MPI_Allgather(&local_ghost_node_count, 1, MPI_INT, ghost_node_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // // Displacements and recv buffer + // std::vector ghost_node_displs(world_size, 0); + // int total_ghost_nodes = 0; + // for (int r = 0; r < world_size; ++r) { + // ghost_node_displs[r] = total_ghost_nodes; + // total_ghost_nodes += ghost_node_counts[r]; + // } + // std::vector all_ghost_node_gids(total_ghost_nodes); + + // // Gather ghost node gids + // MPI_Allgatherv(ghost_node_gids_vec.data(), local_ghost_node_count, MPI_UNSIGNED_LONG_LONG, + // all_ghost_node_gids.data(), ghost_node_counts.data(), ghost_node_displs.data(), + // MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + + // MPI_Barrier(MPI_COMM_WORLD); + // if(rank == 0) std::cout << " Finished gathering ghost node GIDs" << std::endl; + + + // MPI_Barrier(MPI_COMM_WORLD); + // if(rank == 0) std::cout << " Starting to build the reverse map for node communication" << std::endl; + + // // Build map node_gid -> ranks that ghost it + // std::unordered_map> node_gid_to_ghosting_ranks; + // node_gid_to_ghosting_ranks.reserve(static_cast(total_ghost_nodes)); + // for (int r = 0; r < world_size; ++r) { + // int cnt = ghost_node_counts[r]; + // int off = ghost_node_displs[r]; + // for (int i = 0; i < cnt; ++i) { + 
// size_t g = all_ghost_node_gids[off + i]; + // node_gid_to_ghosting_ranks[g].push_back(r); + // } + // } - // For each local node, list destinations: ranks that ghost our node gid - for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { - size_t local_node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - auto it = node_gid_to_ghosting_ranks.find(local_node_gid); - if (it == node_gid_to_ghosting_ranks.end()) continue; - const std::vector &dest_ranks = it->second; - for (int rr : dest_ranks) { - if (rr == rank) continue; - boundary_node_targets[node_lid].push_back(std::make_pair(rr, local_node_gid)); - } - } + // // For each local node, list destinations: ranks that ghost our node gid + // for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { + // size_t local_node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + // auto it = node_gid_to_ghosting_ranks.find(local_node_gid); + // if (it == node_gid_to_ghosting_ranks.end()) continue; + // const std::vector &dest_ranks = it->second; + // for (int rr : dest_ranks) { + // if (rr == rank) continue; + // boundary_node_targets[node_lid].push_back(std::make_pair(rr, local_node_gid)); + // } + // } - std::cout.flush(); - MPI_Barrier(MPI_COMM_WORLD); - print_info = false; + // std::cout.flush(); + // MPI_Barrier(MPI_COMM_WORLD); + // print_info = false; - // Optional: print a compact summary of node reverse map for verification (limited output) - for(int i = 0; i < world_size; i++) { - if (rank == i && print_info) { - std::cout << std::endl; - for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { + // // Optional: print a compact summary of node reverse map for verification (limited output) + // for(int i = 0; i < world_size; i++) { + // if (rank == i && print_info) { + // std::cout << std::endl; + // for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { - size_t local_node_gid = 
intermediate_mesh.local_to_global_node_mapping.host(node_lid); - if (boundary_node_targets[node_lid].empty()) - { - std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: no ghost nodes" << std::endl; - } - else - { - std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: "; - int shown = 0; - for (const auto &pr : boundary_node_targets[node_lid]) { - if (shown >= 12) { std::cout << " ..."; break; } - std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; - shown++; - } - std::cout << std::endl; - } - } - std::cout.flush(); - } - MPI_Barrier(MPI_COMM_WORLD); - } + // size_t local_node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + // if (boundary_node_targets[node_lid].empty()) + // { + // std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: no ghost nodes" << std::endl; + // } + // else + // { + // std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: "; + // int shown = 0; + // for (const auto &pr : boundary_node_targets[node_lid]) { + // if (shown >= 12) { std::cout << " ..."; break; } + // std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; + // shown++; + // } + // std::cout << std::endl; + // } + // } + // std::cout.flush(); + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } - print_info = false; + // print_info = false; - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Finished building node communication reverse map" << std::endl; + // MPI_Barrier(MPI_COMM_WORLD); + // if(rank == 0) std::cout << " Finished building node communication reverse map" << std::endl; + - } diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index e1383ccb..b14ee9cd 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ 
b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {20, 20, 20}; + int num_elems_dim[3] = {50, 50, 50}; // Initial mesh built on rank zero Mesh_t initial_mesh; diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index 636a1bd0..54766d31 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -15,6 +15,9 @@ class MPICArrayKokkos { // Dual view for managing data on both CPU and GPU DCArrayKokkos this_array_; + + DCArrayKokkos send_buffer_; + DCArrayKokkos recv_buffer_; protected: size_t dims_[7]; @@ -97,12 +100,16 @@ class MPICArrayKokkos { size_t n, size_t o) const; KOKKOS_INLINE_FUNCTION - MPIDArrayKokkos& operator=(const MPIDArrayKokkos& temp); + MPICArrayKokkos& operator=(const MPICArrayKokkos& temp); // Method to set comm plan KOKKOS_INLINE_FUNCTION - void initialize_comm_plan(CommunicationPlan* comm_plan); + void initialize_comm_plan(CommunicationPlan& comm_plan){ + comm_plan_ = &comm_plan; + send_buffer_ = DCArrayKokkos(comm_plan_->total_send_count, "send_buffer"); + recv_buffer_ = DCArrayKokkos(comm_plan_->total_recv_count, "recv_buffer"); + }; @@ -132,7 +139,7 @@ class MPICArrayKokkos { // Method returns kokkos dual view KOKKOS_INLINE_FUNCTION - TArray1D get_kokkos_dual_view() const; + Kokkos::DualView get_kokkos_dual_view() const; // Method that update host view void update_host(); @@ -140,11 +147,174 @@ class MPICArrayKokkos { // Method that update device view void update_device(); + // Method that builds the send buffer + void fill_send_buffer(){ + + int rank; + MPI_Comm_rank(comm_plan_->mpi_comm_world, &rank); + + // this_array_.update_host(); + int send_idx = 0; + for(int i = 0; i < comm_plan_->num_send_ranks; i++){ + for(int j = 0; j < comm_plan_->send_counts_.host(i); j++){ + int src_idx = comm_plan_->send_indices_.host(i, j); + 
send_buffer_.host(send_idx) = this_array_.host(src_idx); + if(rank == 0) std::cout << "MPICArrayKokkos::fill_send_buffer() - send_buffer(" << send_idx << ") = " << this_array_.host(src_idx) << std::endl; + send_idx++; + } + } + }; + + // Method that copies the recv buffer + void copy_recv_buffer(){ + int rank; + MPI_Comm_rank(comm_plan_->mpi_comm_world, &rank); + + // NOTE: Do NOT call recv_buffer_.update_host() here! + // MPI already wrote directly to host memory, so calling update_host() + // would overwrite the received data by copying stale device data + int recv_idx = 0; + for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ + for(int j = 0; j < comm_plan_->recv_counts_.host(i); j++){ + int dest_idx = comm_plan_->recv_indices_.host(i, j); + this_array_.host(dest_idx) = recv_buffer_.host(recv_idx); + //if(rank == 0) std::cout << "MPICArrayKokkos::copy_recv_buffer() - this_array(" << dest_idx << ") = " << recv_buffer_.host(recv_idx) << std::endl; + recv_idx++; + } + } + }; + + void communicate(){ + int rank; + MPI_Comm_rank(comm_plan_->mpi_comm_world, &rank); + + if(rank == 0) { + std::cout << "MPICArrayKokkos::communicate() - this_array size: " << this_array_.size() << std::endl; + std::cout << "MPICArrayKokkos::communicate() - send_buffer size: " << send_buffer_.size() + << ", recv_buffer size: " << recv_buffer_.size() << std::endl; + std::cout << "MPICArrayKokkos::communicate() - total_send_count: " << comm_plan_->total_send_count + << ", total_recv_count: " << comm_plan_->total_recv_count << std::endl; + } + + fill_send_buffer(); + + if(rank == 0) std::cout << "MPICArrayKokkos::communicate() - Starting MPI_Neighbor_alltoallv" << std::endl; + + + MPI_Barrier(comm_plan_->mpi_comm_world); + + // Verify buffer sizes match expected + if(rank == 0) { + std::cout << "Send buffer size check: " << send_buffer_.size() << " vs expected " << comm_plan_->total_send_count << std::endl; + std::cout << "Recv buffer size check: " << recv_buffer_.size() << " vs expected " << 
comm_plan_->total_recv_count << std::endl; + + // Print first few send values + std::cout << "MPICArrayKokkos::communicate() - send_buffer values: "; + for(int i = 0; i < 10 && i < send_buffer_.size(); i++) { + std::cout << send_buffer_.host(i) << " "; + } + std::cout << std::endl; + + // Print send counts and displs + std::cout << "Send counts: "; + int total_send = 0; + for(int i = 0; i < comm_plan_->num_send_ranks; i++) { + int count = comm_plan_->send_counts_.host(i); + std::cout << count << " "; + total_send += count; + } + std::cout << "(total=" << total_send << ")" << std::endl; + + std::cout << "Send displs: "; + for(int i = 0; i < comm_plan_->num_send_ranks; i++) { + std::cout << comm_plan_->send_displs_.host(i) << " "; + } + std::cout << std::endl; + + // Print recv counts and displs + std::cout << "Recv counts: "; + int total_recv = 0; + for(int i = 0; i < comm_plan_->num_recv_ranks; i++) { + int count = comm_plan_->recv_counts_.host(i); + std::cout << count << " "; + total_recv += count; + } + std::cout << "(total=" << total_recv << ")" << std::endl; + + std::cout << "Recv displs: "; + for(int i = 0; i < comm_plan_->num_recv_ranks; i++) { + std::cout << comm_plan_->recv_displs_.host(i) << " "; + } + std::cout << std::endl; + } + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << "MPICArrayKokkos::communicate() calling MPI_Neighbor_alltoallv"< 0) ? &send_buffer_.host(0) : nullptr; + T* recv_buf_ptr = (recv_buffer_.size() > 0) ? &recv_buffer_.host(0) : nullptr; + int* send_cnt_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_counts_.host(0) : nullptr; + int* send_dsp_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_displs_.host(0) : nullptr; + int* recv_cnt_ptr = (comm_plan_->num_recv_ranks > 0) ? &comm_plan_->recv_counts_.host(0) : nullptr; + int* recv_dsp_ptr = (comm_plan_->num_recv_ranks > 0) ? 
&comm_plan_->recv_displs_.host(0) : nullptr; + + if(rank == 0) { + std::cout << "Pointer addresses:" << std::endl; + std::cout << " send_buf_ptr = " << (void*)send_buf_ptr << std::endl; + std::cout << " send_cnt_ptr = " << (void*)send_cnt_ptr << std::endl; + std::cout << " send_dsp_ptr = " << (void*)send_dsp_ptr << std::endl; + std::cout << " recv_buf_ptr = " << (void*)recv_buf_ptr << std::endl; + std::cout << " recv_cnt_ptr = " << (void*)recv_cnt_ptr << std::endl; + std::cout << " recv_dsp_ptr = " << (void*)recv_dsp_ptr << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + + MPI_Neighbor_alltoallv( + &send_buffer_.host(0), + &comm_plan_->send_counts_.host(0), + &comm_plan_->send_displs_.host(0), + MPI_DOUBLE, + &recv_buffer_.host(0), + &comm_plan_->recv_counts_.host(0), + &comm_plan_->recv_displs_.host(0), + MPI_DOUBLE, + comm_plan_->mpi_comm_graph); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << "MPICArrayKokkos::communicate() finished MPI_Neighbor_alltoallv"<::MPICArrayKokkos() template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) { this_array_ = DCArrayKokkos(dim0, tag_string); - host = ViewCArray (this_array_.view_host().data(), dim0); + host = ViewCArray (this_array_.host_pointer(), dim0); } // Overloaded 2D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) { this_array_ = DCArrayKokkos(dim0, dim1, tag_string); - host = ViewCArray (this_array_.view_host().data(), dim0, dim1); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1); } // Overloaded 3D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, tag_string); - host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2); } // Overloaded 4D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t 
dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, tag_string); - host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3); } // Overloaded 5D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, tag_string); - host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3, dim4); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4); } // Overloaded 6D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, tag_string); - host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3, dim4, dim5); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5); } // Overloaded 7D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, dim6, tag_string); - host = ViewCArray (this_array_.view_host().data(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); } @@ -283,8 +453,12 @@ T& MPICArrayKokkos::operator()(size_t i, size_t template KOKKOS_INLINE_FUNCTION -MPIDArrayKokkos& MPICArrayKokkos::operator=(const MPIDArrayKokkos& temp) { +MPICArrayKokkos& MPICArrayKokkos::operator=(const MPICArrayKokkos& temp) { this_array_ = temp.this_array_; + host = temp.host; // Also copy the host ViewCArray + comm_plan_ = temp.comm_plan_; + send_buffer_ = 
temp.send_buffer_; + recv_buffer_ = temp.recv_buffer_; return *this; } @@ -346,7 +520,8 @@ void MPICArrayKokkos::update_device() { template KOKKOS_INLINE_FUNCTION MPICArrayKokkos::~MPICArrayKokkos() { - this_array_.~DCArrayKokkos(); + // Member variables (this_array_, send_buffer_, recv_buffer_) are automatically + // destroyed by the compiler - no explicit cleanup needed } #endif \ No newline at end of file diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h index 01f54624..b1ad58a4 100644 --- a/examples/mesh_decomp/state.h +++ b/examples/mesh_decomp/state.h @@ -35,6 +35,7 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define STATE_H #include "matar.h" +#include "mpi_type.h" using namespace mtr; @@ -95,17 +96,24 @@ enum class gauss_pt_state struct GaussPoint_t { - DCArrayKokkos fields; ///< GaussPoint fields + //DCArrayKokkos fields; ///< GaussPoint fields + + + MPICArrayKokkos fields; // initialization method (num_cells, num_dims) - void initialize(size_t num_gauss_pnts, size_t num_dims, std::vector gauss_pt_states) + void initialize(size_t num_gauss_pnts, size_t num_dims, std::vector gauss_pt_states, CommunicationPlan& comm_plan) { for (auto field : gauss_pt_states){ switch(field){ case gauss_pt_state::fields: - if (fields.size() == 0) this->fields = DCArrayKokkos(num_gauss_pnts, "gauss_point_fields"); + //if (fields.size() == 0) this->fields = DCArrayKokkos(num_gauss_pnts, "gauss_point_fields"); + if (fields.size() == 0){ + this->fields = MPICArrayKokkos(num_gauss_pnts, "gauss_point_fields"); + this->fields.initialize_comm_plan(comm_plan); + } break; default: std::cout<<"Desired gauss point state not understood in GaussPoint_t initialize"< Date: Thu, 6 Nov 2025 15:56:37 -0600 Subject: [PATCH 21/52] ENH: Testing multi-dimensional MPICArrayKokkos types, working --- examples/mesh_decomp/decomp_utils.h | 54 +++-- examples/mesh_decomp/mesh_io.h | 32 ++- examples/mesh_decomp/mpi_type.h | 321 ++++++++++++++-------------- 
examples/mesh_decomp/state.h | 12 +- 4 files changed, 244 insertions(+), 175 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 49e1113e..e3421259 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -1812,9 +1812,9 @@ void partition_mesh( } // 2. Build owned node GIDs and their coordinates - std::vector owned_gids(intermediate_mesh.num_nodes); - for (int i = 0; i < owned_gids.size(); ++i) - owned_gids[i] = intermediate_mesh.local_to_global_node_mapping.host(i); + std::vector owned_gids(final_mesh.num_owned_nodes); + for (int i = 0; i < final_mesh.num_owned_nodes; ++i) + owned_gids[i] = final_mesh.local_to_global_node_mapping.host(i); // 3. Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) // so we can distribute the needed coordinate data. @@ -1901,16 +1901,18 @@ void partition_mesh( // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
// -------------------------------------------------------------------------------------- - std::vector>> boundary_elem_targets(intermediate_mesh.num_elems); + std::vector>> boundary_elem_targets(final_mesh.num_owned_elems); // Prepare local ghost list as vector std::vector ghost_gids_vec; - ghost_gids_vec.reserve(ghost_elem_gids.size()); - for (const auto &g : ghost_elem_gids) ghost_gids_vec.push_back(g); + ghost_gids_vec.reserve(final_mesh.num_ghost_elems); + for (int i = 0; i < final_mesh.num_ghost_elems; ++i) { + ghost_gids_vec.push_back(final_mesh.local_to_global_elem_mapping.host(final_mesh.num_owned_elems + i)); // Ghost elements are after the owned elements in the global element mapping + } // Exchange counts std::vector ghost_counts(world_size, 0); - int local_ghost_count = static_cast(ghost_gids_vec.size()); + int local_ghost_count = final_mesh.num_ghost_elems; MPI_Allgather(&local_ghost_count, 1, MPI_INT, ghost_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); // Displacements and recv buffer @@ -1947,8 +1949,8 @@ void partition_mesh( } // For each local element, list destinations: ranks that ghost our gid - for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { - size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); + for (int elem_lid = 0; elem_lid < final_mesh.num_owned_elems; elem_lid++) { + size_t local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); auto it = gid_to_ghosting_ranks.find(local_elem_gid); if (it == gid_to_ghosting_ranks.end()) continue; const std::vector &dest_ranks = it->second; @@ -1966,9 +1968,9 @@ void partition_mesh( for(int i = 0; i < world_size; i++) { if (rank == i && print_info) { std::cout << std::endl; - for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { + for (int elem_lid = 0; elem_lid < final_mesh.num_owned_elems; elem_lid++) { - size_t local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); + size_t 
local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); if (boundary_elem_targets[elem_lid].empty()) { std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: no ghost elements" << std::endl; @@ -1997,9 +1999,9 @@ void partition_mesh( std::set ghost_comm_ranks; // set of ranks that this rank communicates with - for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { + for (int elem_lid = 0; elem_lid < final_mesh.num_owned_elems; elem_lid++) { - int local_elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(elem_lid); + int local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); if (boundary_elem_targets[elem_lid].empty()) { continue; @@ -2217,20 +2219,30 @@ void partition_mesh( // Gauss points share the same communication plan as elements. // This test initializes gauss point fields on owned elements and exchanges them with ghost elements. - gauss_point.initialize(final_mesh.num_elems, 1, {gauss_pt_state::fields}, element_communication_plan); // , &element_communication_plan + std::vector gauss_pt_states = {gauss_pt_state::fields, gauss_pt_state::fields_vec}; + + gauss_point.initialize(final_mesh.num_elems, final_mesh.num_dims, gauss_pt_states, element_communication_plan); // , &element_communication_plan // Initialize the gauss point fields on each rank // Set owned elements to rank number, ghost elements to -1 (to verify communication) for (int i = 0; i < final_mesh.num_owned_elems; i++) { // if(rank == 0) std::cout << " Setting owned element " << i << " to rank " << rank << std::endl; gauss_point.fields.host(i) = static_cast(rank); + gauss_point.fields_vec.host(i, 0) = static_cast(rank); + gauss_point.fields_vec.host(i, 1) = static_cast(rank); + gauss_point.fields_vec.host(i, 2) = static_cast(rank); } for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { gauss_point.fields.host(i) = -1.0; // Ghost elements should be 
updated + gauss_point.fields_vec.host(i, 0) = -1.0; + gauss_point.fields_vec.host(i, 1) = -1.0; + gauss_point.fields_vec.host(i, 2) = -1.0; } gauss_point.fields.update_device(); - + gauss_point.fields_vec.update_device(); + gauss_point.fields.communicate(); + gauss_point.fields_vec.communicate(); // Loop over all elements and average the values of elements connected to that element for (int i = 0; i < final_mesh.num_elems; i++) { @@ -2241,7 +2253,17 @@ void partition_mesh( value /= final_mesh.num_elems_in_elem(i); gauss_point.fields.host(i) = value; } - gauss_point.fields.update_device(); + for (int i = 0; i < final_mesh.num_elems; i++) { + double value = 0.0; + for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { + value += gauss_point.fields_vec.host(final_mesh.elems_in_elem(i, j), 0); + } + value /= final_mesh.num_elems_in_elem(i); + gauss_point.fields_vec.host(i, 0) = value; + gauss_point.fields_vec.host(i, 1) = value; + gauss_point.fields_vec.host(i, 2) = value; + } + gauss_point.fields_vec.update_device(); diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index 77dac8d0..8170e531 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -513,7 +513,7 @@ void write_vtu(Mesh_t& mesh, Kokkos::fence(); const int num_cell_scalar_vars = 4; - const int num_cell_vec_vars = 0; + const int num_cell_vec_vars = 1; const int num_cell_tensor_vars = 0; const int num_point_scalar_vars = 3; @@ -524,6 +524,10 @@ void write_vtu(Mesh_t& mesh, "rank_id", "elems_in_elem_owned", "global_elem_id", "field_value" }; + const char cell_vec_var_names[num_cell_vec_vars][15] = { + "field_vec" + }; + const char point_scalar_var_names[num_point_scalar_vars][15] = { "rank_id", "elems_in_node", "global_node_id" }; @@ -539,12 +543,16 @@ void write_vtu(Mesh_t& mesh, // save the cell state to an array for exporting to graphics files auto elem_fields = CArray(num_elems, num_cell_scalar_vars); - + auto elem_vec_fields = CArray(num_elems, 
num_cell_vec_vars, 3); + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { elem_fields(elem_gid, 0) = rank; elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); elem_fields(elem_gid, 3) = gauss_point.fields.host(elem_gid); + elem_vec_fields(elem_gid, 0, 0) = gauss_point.fields_vec.host(elem_gid, 0); + elem_vec_fields(elem_gid, 0, 1) = gauss_point.fields_vec.host(elem_gid, 1); + elem_vec_fields(elem_gid, 0, 2) = gauss_point.fields_vec.host(elem_gid, 2); } // save the vertex vector fields to an array for exporting to graphics files @@ -670,6 +678,22 @@ void write_vtu(Mesh_t& mesh, // Write CellData (element fields) fprintf(vtu_file, " \n"); + + // Cell vector variables + for (int var = 0; var < num_cell_vec_vars; var++) { + fprintf(vtu_file, " \n", + cell_vec_var_names[var]); + for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { + // TODO: Populate cell vector field data from appropriate source + fprintf(vtu_file, " %f %f %f\n", + gauss_point.fields_vec.host(elem_gid, 0), + gauss_point.fields_vec.host(elem_gid, 1), + gauss_point.fields_vec.host(elem_gid, 2)); + } + fprintf(vtu_file, " \n"); + } + + // Cell scalar variables for (int var = 0; var < num_cell_scalar_vars; var++) { fprintf(vtu_file, " \n", cell_scalar_var_names[var]); @@ -730,6 +754,10 @@ void write_vtu(Mesh_t& mesh, // Write PCellData fprintf(pvtu_file, " \n"); + for (int var = 0; var < num_cell_vec_vars; var++) { + fprintf(pvtu_file, " \n", + cell_vec_var_names[var]); + } for (int var = 0; var < num_cell_scalar_vars; var++) { fprintf(pvtu_file, " \n", cell_scalar_var_names[var]); diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index 54766d31..858705d7 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -6,6 +6,69 @@ using namespace mtr; +// Add this before the MPICArrayKokkos class definition + +// Type trait to map C++ types 
to MPI_Datatype +template +struct mpi_type_map { + static MPI_Datatype value() { + static_assert(sizeof(T) == 0, "Unsupported type for MPI communication"); + return MPI_DATATYPE_NULL; + } +}; + +// Specializations for common types +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_INT; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_LONG_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_FLOAT; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_DOUBLE; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_CHAR; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED_CHAR; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_C_BOOL; } +}; + + ///////////////////////// // MPICArrayKokkos: Dual type for managing distributed data on both CPU and GPU. 
// @@ -39,6 +102,8 @@ class MPICArrayKokkos { DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank + size_t stride_; // [size: num_dims] Number of contiguous values per first index element + DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_by_rank] Indices of items to send to each rank DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_by_rank] Indices of items to receive from each rank @@ -46,8 +111,6 @@ class MPICArrayKokkos { size_t num_owned_; // Number of owned items (nodes/elements) size_t num_ghost_; // Number of ghost items (nodes/elements) - - void set_mpi_type(); public: // Data member to access host view @@ -104,13 +167,43 @@ class MPICArrayKokkos { // Method to set comm plan - KOKKOS_INLINE_FUNCTION void initialize_comm_plan(CommunicationPlan& comm_plan){ comm_plan_ = &comm_plan; - send_buffer_ = DCArrayKokkos(comm_plan_->total_send_count, "send_buffer"); - recv_buffer_ = DCArrayKokkos(comm_plan_->total_recv_count, "recv_buffer"); - }; + + size_t send_size = comm_plan_->total_send_count * stride_; + size_t recv_size = comm_plan_->total_recv_count * stride_; + + if (send_size > 0) { + send_buffer_ = DCArrayKokkos(send_size, "send_buffer"); + } + if (recv_size > 0) { + recv_buffer_ = DCArrayKokkos(recv_size, "recv_buffer"); + } + if (comm_plan_->num_send_ranks > 0) { + send_counts_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_counts"); + send_displs_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_displs"); + + for(int i = 0; i < comm_plan_->num_send_ranks; i++){ + send_counts_.host(i) = comm_plan_->send_counts_.host(i) * stride_; + send_displs_.host(i) = comm_plan_->send_displs_.host(i) * stride_; + } + send_counts_.update_device(); + send_displs_.update_device(); + } + + if (comm_plan_->num_recv_ranks > 0) { + recv_counts_ = 
DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_counts"); + recv_displs_ = DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_displs"); + + for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ + recv_counts_.host(i) = comm_plan_->recv_counts_.host(i) * stride_; + recv_displs_.host(i) = comm_plan_->recv_displs_.host(i) * stride_; + } + recv_counts_.update_device(); + recv_displs_.update_device(); + } + }; // GPU Method @@ -147,228 +240,145 @@ class MPICArrayKokkos { // Method that update device view void update_device(); - // Method that builds the send buffer + // Method that builds the send buffer, note, this has to be ordered + // Such that all the boundary elements going to a given rank are contiguous in the send buffer. void fill_send_buffer(){ - int rank; - MPI_Comm_rank(comm_plan_->mpi_comm_world, &rank); - // this_array_.update_host(); - int send_idx = 0; + + T* src_ptr = this_array_.host_pointer(); + + + size_t send_idx = 0; for(int i = 0; i < comm_plan_->num_send_ranks; i++){ for(int j = 0; j < comm_plan_->send_counts_.host(i); j++){ - int src_idx = comm_plan_->send_indices_.host(i, j); - send_buffer_.host(send_idx) = this_array_.host(src_idx); - if(rank == 0) std::cout << "MPICArrayKokkos::fill_send_buffer() - send_buffer(" << send_idx << ") = " << this_array_.host(src_idx) << std::endl; - send_idx++; + size_t src_idx = comm_plan_->send_indices_.host(i, j); // index of the element to send + + // Copy all values associated with this element (handles multi-dimensional arrays) + for(size_t k = 0; k < stride_; k++){ + send_buffer_.host(send_idx + k) = src_ptr[src_idx * stride_ + k]; + } + send_idx += stride_; } } }; - // Method that copies the recv buffer + // Method that copies the recv buffer into the this_array void copy_recv_buffer(){ - int rank; - MPI_Comm_rank(comm_plan_->mpi_comm_world, &rank); - - // NOTE: Do NOT call recv_buffer_.update_host() here! 
- // MPI already wrote directly to host memory, so calling update_host() - // would overwrite the received data by copying stale device data - int recv_idx = 0; + + T* dest_ptr = this_array_.host_pointer(); + + size_t recv_idx = 0; for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ for(int j = 0; j < comm_plan_->recv_counts_.host(i); j++){ - int dest_idx = comm_plan_->recv_indices_.host(i, j); - this_array_.host(dest_idx) = recv_buffer_.host(recv_idx); - //if(rank == 0) std::cout << "MPICArrayKokkos::copy_recv_buffer() - this_array(" << dest_idx << ") = " << recv_buffer_.host(recv_idx) << std::endl; - recv_idx++; + size_t dest_idx = comm_plan_->recv_indices_.host(i, j); + + // Copy all values associated with this element (handles multi-dimensional arrays) + for(size_t k = 0; k < stride_; k++){ + dest_ptr[dest_idx * stride_ + k] = recv_buffer_.host(recv_idx + k); + } + + recv_idx += stride_; } } + this_array_.update_device(); }; - void communicate(){ - int rank; - MPI_Comm_rank(comm_plan_->mpi_comm_world, &rank); - - if(rank == 0) { - std::cout << "MPICArrayKokkos::communicate() - this_array size: " << this_array_.size() << std::endl; - std::cout << "MPICArrayKokkos::communicate() - send_buffer size: " << send_buffer_.size() - << ", recv_buffer size: " << recv_buffer_.size() << std::endl; - std::cout << "MPICArrayKokkos::communicate() - total_send_count: " << comm_plan_->total_send_count - << ", total_recv_count: " << comm_plan_->total_recv_count << std::endl; - } - - fill_send_buffer(); - - if(rank == 0) std::cout << "MPICArrayKokkos::communicate() - Starting MPI_Neighbor_alltoallv" << std::endl; - - MPI_Barrier(comm_plan_->mpi_comm_world); - - // Verify buffer sizes match expected - if(rank == 0) { - std::cout << "Send buffer size check: " << send_buffer_.size() << " vs expected " << comm_plan_->total_send_count << std::endl; - std::cout << "Recv buffer size check: " << recv_buffer_.size() << " vs expected " << comm_plan_->total_recv_count << std::endl; - - // 
Print first few send values - std::cout << "MPICArrayKokkos::communicate() - send_buffer values: "; - for(int i = 0; i < 10 && i < send_buffer_.size(); i++) { - std::cout << send_buffer_.host(i) << " "; - } - std::cout << std::endl; - - // Print send counts and displs - std::cout << "Send counts: "; - int total_send = 0; - for(int i = 0; i < comm_plan_->num_send_ranks; i++) { - int count = comm_plan_->send_counts_.host(i); - std::cout << count << " "; - total_send += count; - } - std::cout << "(total=" << total_send << ")" << std::endl; - - std::cout << "Send displs: "; - for(int i = 0; i < comm_plan_->num_send_ranks; i++) { - std::cout << comm_plan_->send_displs_.host(i) << " "; - } - std::cout << std::endl; - - // Print recv counts and displs - std::cout << "Recv counts: "; - int total_recv = 0; - for(int i = 0; i < comm_plan_->num_recv_ranks; i++) { - int count = comm_plan_->recv_counts_.host(i); - std::cout << count << " "; - total_recv += count; - } - std::cout << "(total=" << total_recv << ")" << std::endl; - - std::cout << "Recv displs: "; - for(int i = 0; i < comm_plan_->num_recv_ranks; i++) { - std::cout << comm_plan_->recv_displs_.host(i) << " "; - } - std::cout << std::endl; - } - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << "MPICArrayKokkos::communicate() calling MPI_Neighbor_alltoallv"< 0) ? &send_buffer_.host(0) : nullptr; - T* recv_buf_ptr = (recv_buffer_.size() > 0) ? &recv_buffer_.host(0) : nullptr; - int* send_cnt_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_counts_.host(0) : nullptr; - int* send_dsp_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_displs_.host(0) : nullptr; - int* recv_cnt_ptr = (comm_plan_->num_recv_ranks > 0) ? &comm_plan_->recv_counts_.host(0) : nullptr; - int* recv_dsp_ptr = (comm_plan_->num_recv_ranks > 0) ? 
&comm_plan_->recv_displs_.host(0) : nullptr; - - if(rank == 0) { - std::cout << "Pointer addresses:" << std::endl; - std::cout << " send_buf_ptr = " << (void*)send_buf_ptr << std::endl; - std::cout << " send_cnt_ptr = " << (void*)send_cnt_ptr << std::endl; - std::cout << " send_dsp_ptr = " << (void*)send_dsp_ptr << std::endl; - std::cout << " recv_buf_ptr = " << (void*)recv_buf_ptr << std::endl; - std::cout << " recv_cnt_ptr = " << (void*)recv_cnt_ptr << std::endl; - std::cout << " recv_dsp_ptr = " << (void*)recv_dsp_ptr << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); + // T* send_buf_ptr = (send_buffer_.size() > 0) ? &send_buffer_.host(0) : nullptr; + // T* recv_buf_ptr = (recv_buffer_.size() > 0) ? &recv_buffer_.host(0) : nullptr; + // int* send_cnt_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_counts_.host(0) : nullptr; + // int* send_dsp_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_displs_.host(0) : nullptr; + // int* recv_cnt_ptr = (comm_plan_->num_recv_ranks > 0) ? &comm_plan_->recv_counts_.host(0) : nullptr; + // int* recv_dsp_ptr = (comm_plan_->num_recv_ranks > 0) ? 
&comm_plan_->recv_displs_.host(0) : nullptr; + + // Method that communicates the data between the ranks + void communicate(){ + + this_array_.update_host(); + + fill_send_buffer(); MPI_Neighbor_alltoallv( - &send_buffer_.host(0), - &comm_plan_->send_counts_.host(0), - &comm_plan_->send_displs_.host(0), - MPI_DOUBLE, - &recv_buffer_.host(0), - &comm_plan_->recv_counts_.host(0), - &comm_plan_->recv_displs_.host(0), - MPI_DOUBLE, + send_buffer_.host_pointer(), + send_counts_.host_pointer(), + send_displs_.host_pointer(), + mpi_type_map::value(), // MPI_TYPE + recv_buffer_.host_pointer(), + recv_counts_.host_pointer(), + recv_displs_.host_pointer(), + mpi_type_map::value(), // MPI_TYPE comm_plan_->mpi_comm_graph); - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << "MPICArrayKokkos::communicate() finished MPI_Neighbor_alltoallv"< MPICArrayKokkos::MPICArrayKokkos() - : this_array_() { } + : this_array_(), stride_(1) { } // Overloaded 1D constructor template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) { +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) + : stride_(1) { this_array_ = DCArrayKokkos(dim0, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0); } // Overloaded 2D constructor template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) { +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) + : stride_(dim1) { this_array_ = DCArrayKokkos(dim0, dim1, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1); } // Overloaded 3D constructor template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) { +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) + : stride_(dim1 * dim2) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, 
dim1, dim2); } // Overloaded 4D constructor template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) { +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3); } // Overloaded 5D constructor template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) { +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4); } // Overloaded 6D constructor template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) { +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4 * dim5) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5); } // Overloaded 7D constructor template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) { +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4 * dim5 * dim6) { this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, dim6, tag_string); host = ViewCArray 
(this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); } @@ -459,6 +469,7 @@ MPICArrayKokkos& MPICArrayKokkos fields; - + + MPICArrayKokkos fields_vec; // initialization method (num_cells, num_dims) void initialize(size_t num_gauss_pnts, size_t num_dims, std::vector gauss_pt_states, CommunicationPlan& comm_plan) @@ -115,6 +117,12 @@ struct GaussPoint_t this->fields.initialize_comm_plan(comm_plan); } break; + case gauss_pt_state::fields_vec: + if (fields_vec.size() == 0){ + this->fields_vec = MPICArrayKokkos(num_gauss_pnts, num_dims, "gauss_point_fields_vec"); + this->fields_vec.initialize_comm_plan(comm_plan); + } + break; default: std::cout<<"Desired gauss point state not understood in GaussPoint_t initialize"< Date: Thu, 6 Nov 2025 16:29:24 -0600 Subject: [PATCH 22/52] STYLE: Renaming a thing and headed home --- examples/mesh_decomp/mpi_type.h | 52 ++++++++++++++++++++++++--------- examples/mesh_decomp/state.h | 6 +--- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index 858705d7..c49977c0 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -166,8 +166,8 @@ class MPICArrayKokkos { MPICArrayKokkos& operator=(const MPICArrayKokkos& temp); - // Method to set comm plan - void initialize_comm_plan(CommunicationPlan& comm_plan){ + // Method to set comm plan for halo communication + void initialize_mesh_comm_plan(CommunicationPlan& comm_plan){ comm_plan_ = &comm_plan; size_t send_size = comm_plan_->total_send_count * stride_; @@ -244,11 +244,6 @@ class MPICArrayKokkos { // Such that all the boundary elements going to a given rank are contiguous in the send buffer. 
void fill_send_buffer(){ - - - T* src_ptr = this_array_.host_pointer(); - - size_t send_idx = 0; for(int i = 0; i < comm_plan_->num_send_ranks; i++){ for(int j = 0; j < comm_plan_->send_counts_.host(i); j++){ @@ -256,7 +251,7 @@ class MPICArrayKokkos { // Copy all values associated with this element (handles multi-dimensional arrays) for(size_t k = 0; k < stride_; k++){ - send_buffer_.host(send_idx + k) = src_ptr[src_idx * stride_ + k]; + send_buffer_.host(send_idx + k) = this_array_.host_pointer()[src_idx * stride_ + k]; } send_idx += stride_; } @@ -265,9 +260,7 @@ class MPICArrayKokkos { // Method that copies the recv buffer into the this_array void copy_recv_buffer(){ - - T* dest_ptr = this_array_.host_pointer(); - + size_t recv_idx = 0; for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ for(int j = 0; j < comm_plan_->recv_counts_.host(i); j++){ @@ -275,7 +268,7 @@ class MPICArrayKokkos { // Copy all values associated with this element (handles multi-dimensional arrays) for(size_t k = 0; k < stride_; k++){ - dest_ptr[dest_idx * stride_ + k] = recv_buffer_.host(recv_idx + k); + this_array_.host_pointer()[dest_idx * stride_ + k] = recv_buffer_.host(recv_idx + k); } recv_idx += stride_; @@ -317,6 +310,38 @@ class MPICArrayKokkos { this_array_.update_device(); }; + + + // MPI send wrapper + void send(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI recieve wrapper + void recv(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI broadcast wrapper + void broadcast(size_t count, int root, MPI_Comm comm); + + // MPI scatter wrapper + void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); + + // MPI gather wrapper + void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); + + // MPI allgather wrapper + void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); + + // MPI send wrapper + void isend(size_t count, int 
dest, int tag, MPI_Comm comm); + + // MPI recieve wrapper + void irecv(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI wait wrapper for sender + void wait_send(); + + // MPI wait wrapper for receiver + void wait_recv(); + // Deconstructor virtual KOKKOS_INLINE_FUNCTION ~MPICArrayKokkos (); @@ -531,8 +556,7 @@ void MPICArrayKokkos::update_device() { template KOKKOS_INLINE_FUNCTION MPICArrayKokkos::~MPICArrayKokkos() { - // Member variables (this_array_, send_buffer_, recv_buffer_) are automatically - // destroyed by the compiler - no explicit cleanup needed + } #endif \ No newline at end of file diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h index 556039da..385723ed 100644 --- a/examples/mesh_decomp/state.h +++ b/examples/mesh_decomp/state.h @@ -97,11 +97,7 @@ enum class gauss_pt_state struct GaussPoint_t { - //DCArrayKokkos fields; ///< GaussPoint fields - - MPICArrayKokkos fields; - MPICArrayKokkos fields_vec; // initialization method (num_cells, num_dims) @@ -120,7 +116,7 @@ struct GaussPoint_t case gauss_pt_state::fields_vec: if (fields_vec.size() == 0){ this->fields_vec = MPICArrayKokkos(num_gauss_pnts, num_dims, "gauss_point_fields_vec"); - this->fields_vec.initialize_comm_plan(comm_plan); + this->fields_vec.initialize_mesh_comm_plan(comm_plan); } break; default: From 4447a5ccc614eece4e23610bfa2f2941ec5760fc Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 7 Nov 2025 14:09:04 -0600 Subject: [PATCH 23/52] ENH: Tidying up --- examples/mesh_decomp/communication_plan.h | 3 - examples/mesh_decomp/decomp_utils.h | 259 +++++++--------------- examples/mesh_decomp/mpi_type.h | 105 ++++++--- examples/mesh_decomp/state.h | 2 +- 4 files changed, 153 insertions(+), 216 deletions(-) diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h index 16904e57..63391262 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -59,9 +59,6 @@ using 
namespace mtr; int reorder = 0; - - - DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_per_rank] Indices of items to send to each rank DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_per_rank] Indices of items to receive from each rank diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index e3421259..3c2cefc4 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -22,30 +22,19 @@ #include "ptscotch.h" - - - - -void partition_mesh( +void naive_partition_mesh( Mesh_t& initial_mesh, - Mesh_t& final_mesh, node_t& initial_node, - node_t& final_node, - GaussPoint_t& gauss_point, + Mesh_t& naive_mesh, + node_t& naive_node, + std::vector& elems_in_elem_on_rank, + std::vector& num_elems_in_elem_per_rank, int world_size, - int rank){ + int rank) +{ - bool print_info = false; - bool print_vtk = false; - // Create mesh, gauss points, and node data structures on each rank - // This is the initial partitioned mesh - Mesh_t naive_mesh; - node_t naive_node; - - // Mesh partitioned by pt-scotch, not including ghost - Mesh_t intermediate_mesh; - node_t intermediate_node; + bool print_info = false; int num_elements_on_rank = 0; int num_nodes_on_rank = 0; @@ -68,10 +57,6 @@ void partition_mesh( // Create a 2D vector to hold the nodal positions on each rank std::vector> node_pos_to_send(world_size); - // create a 2D vector to hold the node positions on each rank - std::vector> node_pos_on_rank(world_size); - - if (rank == 0) { num_nodes_per_elem = initial_mesh.num_nodes_in_elem; @@ -89,9 +74,9 @@ void partition_mesh( MPI_Bcast(&num_nodes_per_elem, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); -// ******************************************************** -// Scatter the number of elements to each rank -// ******************************************************** + // ******************************************************** + // 
Scatter the number of elements to each rank + // ******************************************************** // All ranks participate in the scatter operation // MPI_Scatter signature: // MPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, @@ -110,9 +95,9 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); double t_scatter_end = MPI_Wtime(); -// ******************************************************** -// Scatter the actual element global ids to each rank -// ******************************************************** + // ******************************************************** + // Scatter the actual element global ids to each rank + // ******************************************************** double t_scatter_gids_start = MPI_Wtime(); if (rank == 0) { @@ -169,34 +154,10 @@ void partition_mesh( // Wait for all ranks to complete the scatter operation MPI_Barrier(MPI_COMM_WORLD); - // Timer: End measuring time for scattering element global ids - double t_scatter_gids_end = MPI_Wtime(); - if(rank == 0 && print_info) { - std::cout<<" Finished scattering the actual element global ids to each rank"< node_pos_on_rank_flat(num_nodes_on_rank * 3); @@ -366,43 +305,9 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0 && print_info) { - // Print out the node positions on this rank - std::cout << "Rank " << rank << " received node positions: "; - for (int i = 0; i < num_nodes_on_rank; i++) { - std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " - << node_pos_on_rank_flat[i*3+1] << ", " - << node_pos_on_rank_flat[i*3+2] << ") "; - } - std::cout << std::endl; - } - - - MPI_Barrier(MPI_COMM_WORLD); - - if (rank == 1 && print_info) { - // Print out the node positions on this rank - std::cout << "Rank " << rank << " received node positions: "; - for (int i = 0; i < num_nodes_on_rank; i++) { - std::cout << "(" << node_pos_on_rank_flat[i*3] << ", " - << node_pos_on_rank_flat[i*3+1] << ", " - << node_pos_on_rank_flat[i*3+2] << ") "; - } - std::cout 
<< std::endl; - } - - MPI_Barrier(MPI_COMM_WORLD); - - double t_scatter_nodepos_end = MPI_Wtime(); - if(rank == 0) { - std::cout<<" Finished scattering the node positions to each rank"< required_node_state = { node_state::coords }; @@ -417,9 +322,9 @@ void partition_mesh( naive_node.coords.update_device(); -// ****************************************************************************************** -// Send the element-node connectivity data from the initial mesh to each rank -// ****************************************************************************************** + // ****************************************************************************************** + // Send the element-node connectivity data from the initial mesh to each rank + // ****************************************************************************************** // Send the element-node connectivity data from the initial mesh to each rank std::vector nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); @@ -457,38 +362,15 @@ void partition_mesh( 0, MPI_COMM_WORLD); } - MPI_Barrier(MPI_COMM_WORLD); - - double t_scatter_elemnode_end = MPI_Wtime(); - if(rank == 0) { - std::cout << " Finished scattering the element-node connectivity data from the initial mesh to each rank" << std::endl; - std::cout << " Scattering element-node connectivity took " - << (t_scatter_elemnode_end - t_scatter_elemnode_start) << " seconds." 
<< std::endl; - } - - if (rank == 0 && print_info) { - - std::cout << "Rank " << rank << " received element-node connectivity (" - << num_elements_on_rank << " elements, " << nodes_in_elem_on_rank.size() << " entries):" << std::endl; - for (int elem = 0; elem < num_elements_on_rank; elem++) { - std::cout << " Element " << elem << " nodes: "; - for (int node = 0; node < num_nodes_per_elem; node++) { - int idx = elem * num_nodes_per_elem + node; - std::cout << nodes_in_elem_on_rank[idx] << " "; - } - std::cout << std::endl; - } - } - MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout<<" Finished scattering the element-node connectivity data from the initial mesh to each rank"< elems_in_elem_on_rank(total_elem_elem_entries); + elems_in_elem_on_rank.resize(total_elem_elem_entries); // Now scatter the num_elems_in_elem for each element on each rank - std::vector num_elems_in_elem_per_rank(num_elements_on_rank); + num_elems_in_elem_per_rank.resize(num_elements_on_rank); if (rank == 0) { std::vector all_num_elems_in_elem; @@ -645,9 +508,9 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); -// ****************************************************************************************** -// Initialize the naive_mesh data structures for each rank -// ****************************************************************************************** + // ****************************************************************************************** + // Initialize the naive_mesh data structures for each rank + // ****************************************************************************************** naive_mesh.initialize_nodes(num_nodes_on_rank); naive_mesh.initialize_elems(num_elements_on_rank, 3); @@ -714,12 +577,40 @@ void partition_mesh( naive_mesh.build_connectivity(); MPI_Barrier(MPI_COMM_WORLD); + + return; +} + +void partition_mesh( + Mesh_t& initial_mesh, + Mesh_t& final_mesh, + node_t& initial_node, + node_t& final_node, + GaussPoint_t& gauss_point, + int world_size, + int 
rank){ + + bool print_info = false; + bool print_vtk = false; + + // Create mesh, gauss points, and node data structures on each rank + // This is the initial partitioned mesh + Mesh_t naive_mesh; + node_t naive_node; + + // Mesh partitioned by pt-scotch, not including ghost + Mesh_t intermediate_mesh; + node_t intermediate_node; + + + // Helper arrays to hold element-element connectivity for naive partitioning that include what would be ghost, without having to build the full mesh + std::vector elems_in_elem_on_rank; + std::vector num_elems_in_elem_per_rank; + + naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); - // if (print_vtk) { - // write_vtk(naive_mesh, naive_node, rank); - // } @@ -784,7 +675,7 @@ void partition_mesh( * neighbors it has. * - elems_in_elem_on_rank: flattened array of global neighbor IDs for all local elements. * - **********************************************************************************/ + **********************************************************************************/ // --- Step 1: Initialize the PT-Scotch distributed graph object on this MPI rank --- SCOTCH_Dgraph dgraph; @@ -814,9 +705,10 @@ void partition_mesh( // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. 
std::map elem_gid_to_offset; size_t current_offset = 0; - for (size_t k = 0; k < num_elements_on_rank; k++) { - elem_gid_to_offset[elements_on_rank[k]] = current_offset; - current_offset += num_elems_in_elem_per_rank[k]; + for (size_t k = 0; k < naive_mesh.num_elems; k++) { + int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); + elem_gid_to_offset[elem_gid_on_rank] = current_offset; + current_offset += num_elems_in_elem_per_rank[k]; // WARNING< THIS MUST INCLUDE GHOST< WHICH DONT EXISTS ON THE NAIVE MESH } // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- @@ -836,8 +728,9 @@ void partition_mesh( // For this element, find the count of its neighbors // This requires finding its index in the elements_on_rank array size_t idx = 0; - for (size_t k = 0; k < num_elements_on_rank; k++) { - if (elements_on_rank[k] == elem_gid) { + for (size_t k = 0; k < naive_mesh.num_elems; k++) { + int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); + if (elem_gid_on_rank == elem_gid) { idx = k; break; } diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index c49977c0..98f62313 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -116,6 +116,12 @@ class MPICArrayKokkos { // Data member to access host view ViewCArray host; + + // Note, consider this for sending blocks without dealing with stride_ + // MPI_Datatype vector_type; + // MPI_Type_contiguous(stride_, mpi_type_map::value(), &vector_type); + // MPI_Type_commit(&vector_type); + MPICArrayKokkos(); MPICArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); @@ -167,7 +173,7 @@ class MPICArrayKokkos { // Method to set comm plan for halo communication - void initialize_mesh_comm_plan(CommunicationPlan& comm_plan){ + void initialize_comm_plan(CommunicationPlan& comm_plan){ comm_plan_ = &comm_plan; size_t send_size = comm_plan_->total_send_count * stride_; @@ -288,6 +294,8 @@ class 
MPICArrayKokkos { // int* recv_dsp_ptr = (comm_plan_->num_recv_ranks > 0) ? &comm_plan_->recv_displs_.host(0) : nullptr; // Method that communicates the data between the ranks + // NOTE: This is a blocking communication operation, + // if you want to use non-blocking communication, you can use the following: MPI_Ineighbor_alltoallv void communicate(){ this_array_.update_host(); @@ -310,37 +318,43 @@ class MPICArrayKokkos { this_array_.update_device(); }; + void set_values(const T& value){ + this_array_.set_values(value); + }; + + void reduce_sum(T& result){}; - // MPI send wrapper - void send(size_t count, int dest, int tag, MPI_Comm comm); - // MPI recieve wrapper - void recv(size_t count, int dest, int tag, MPI_Comm comm); + // // MPI send wrapper + // void send(size_t count, int dest, int tag, MPI_Comm comm); - // MPI broadcast wrapper - void broadcast(size_t count, int root, MPI_Comm comm); + // // MPI recieve wrapper + // void recv(size_t count, int dest, int tag, MPI_Comm comm); - // MPI scatter wrapper - void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); + // // MPI broadcast wrapper + // void broadcast(size_t count, int root, MPI_Comm comm); - // MPI gather wrapper - void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); + // // MPI scatter wrapper + // void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - // MPI allgather wrapper - void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); + // // MPI gather wrapper + // void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - // MPI send wrapper - void isend(size_t count, int dest, int tag, MPI_Comm comm); + // // MPI allgather wrapper + // void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); - // MPI recieve wrapper - void 
irecv(size_t count, int dest, int tag, MPI_Comm comm); + // // MPI send wrapper + // void isend(size_t count, int dest, int tag, MPI_Comm comm); - // MPI wait wrapper for sender - void wait_send(); + // // MPI recieve wrapper + // void irecv(size_t count, int dest, int tag, MPI_Comm comm); - // MPI wait wrapper for receiver - void wait_recv(); + // // MPI wait wrapper for sender + // void wait_send(); + + // // MPI wait wrapper for receiver + // void wait_recv(); // Deconstructor virtual KOKKOS_INLINE_FUNCTION @@ -350,12 +364,17 @@ class MPICArrayKokkos { // Default constructor template MPICArrayKokkos::MPICArrayKokkos() - : this_array_(), stride_(1) { } + : this_array_(), stride_(1), length_(0), order_(0) { + for (int i = 0; i < 7; i++) { + dims_[i] = 0; + } + } // Overloaded 1D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) - : stride_(1) { + : stride_(1), length_(dim0), order_(1) { + dims_[0] = dim0; this_array_ = DCArrayKokkos(dim0, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0); } @@ -363,7 +382,10 @@ MPICArrayKokkos::MPICArrayKokkos(size_t dim0, c // Overloaded 2D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) - : stride_(dim1) { + : stride_(dim1), length_(dim0 * dim1), order_(2) { + dims_[0] = dim0; + dims_[1] = dim1; + this_array_ = DCArrayKokkos(dim0, dim1, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1); } @@ -371,7 +393,10 @@ MPICArrayKokkos::MPICArrayKokkos(size_t dim0, s // Overloaded 3D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) - : stride_(dim1 * dim2) { + : stride_(dim1 * dim2), length_(dim0 * dim1 * dim2), order_(3) { + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; this_array_ = DCArrayKokkos(dim0, dim1, dim2, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2); } @@ -379,7 
+404,11 @@ MPICArrayKokkos::MPICArrayKokkos(size_t dim0, s // Overloaded 4D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3) { + : stride_(dim1 * dim2 * dim3), length_(dim0 * dim1 * dim2 * dim3), order_(4) { + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3); } @@ -387,7 +416,12 @@ MPICArrayKokkos::MPICArrayKokkos(size_t dim0, s // Overloaded 5D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3 * dim4) { + : stride_(dim1 * dim2 * dim3 * dim4), length_(dim0 * dim1 * dim2 * dim3 * dim4), order_(5) { + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4); } @@ -395,7 +429,13 @@ MPICArrayKokkos::MPICArrayKokkos(size_t dim0, s // Overloaded 6D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3 * dim4 * dim5) { + : stride_(dim1 * dim2 * dim3 * dim4 * dim5), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5), order_(6) { + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + dims_[5] = dim5; this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5); } @@ -403,7 +443,14 @@ MPICArrayKokkos::MPICArrayKokkos(size_t dim0, s // Overloaded 7D constructor template MPICArrayKokkos::MPICArrayKokkos(size_t dim0, 
size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3 * dim4 * dim5 * dim6) { + : stride_(dim1 * dim2 * dim3 * dim4 * dim5 * dim6), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6), order_(7) { + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + dims_[5] = dim5; + dims_[6] = dim6; this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, dim6, tag_string); host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); } diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h index 385723ed..2ed970d5 100644 --- a/examples/mesh_decomp/state.h +++ b/examples/mesh_decomp/state.h @@ -116,7 +116,7 @@ struct GaussPoint_t case gauss_pt_state::fields_vec: if (fields_vec.size() == 0){ this->fields_vec = MPICArrayKokkos(num_gauss_pnts, num_dims, "gauss_point_fields_vec"); - this->fields_vec.initialize_mesh_comm_plan(comm_plan); + this->fields_vec.initialize_comm_plan(comm_plan); } break; default: From 588fec59005a2ac958314c28850c69b3d9f6697f Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 7 Nov 2025 14:47:48 -0600 Subject: [PATCH 24/52] ENH: Tidying up --- examples/mesh_decomp/communication_plan.h | 26 +-- examples/mesh_decomp/decomp_utils.h | 228 ++-------------------- examples/mesh_decomp/mesh_decomp.cpp | 13 +- 3 files changed, 31 insertions(+), 236 deletions(-) diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h index 63391262..eabba8da 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -278,31 +278,11 @@ using namespace mtr; } this->recv_displs_.update_device(); - - // Print the send and recv data sequentially per MPI rank for clarity - MPI_Barrier(mpi_comm_world); - int rank, nprocs; - MPI_Comm_rank(mpi_comm_world, &rank); - MPI_Comm_size(mpi_comm_world, &nprocs); - for(int r = 
0; r < nprocs; r++) { - MPI_Barrier(mpi_comm_world); - if(rank == r) { - std::cout << "==============================" << std::endl; - std::cout << "CommunicationPlan info for rank " << rank << std::endl; - for(int i = 0; i < num_send_ranks; i++){ - std::cout << " Send count to rank[" << i << "] (dest rank " << this->send_rank_ids.host(i) << "): " << this->send_counts_.host(i) << std::endl; - std::cout << " Send displs to rank[" << i << "]: " << this->send_displs_.host(i) << std::endl; - } - for(int i = 0; i < num_recv_ranks; i++){ - std::cout << " Recv count from rank[" << i << "] (source rank " << this->recv_rank_ids.host(i) << "): " << this->recv_counts_.host(i) << std::endl; - std::cout << " Recv displs from rank[" << i << "]: " << this->recv_displs_.host(i) << std::endl; - } - std::cout << "==============================" << std::endl << std::flush; - } - } MPI_Barrier(mpi_comm_world); } }; -#endif // COMMUNICATION_PLAN_H \ No newline at end of file +#endif // COMMUNICATION_PLAN_H + + diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 3c2cefc4..bab3f35d 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -33,7 +33,6 @@ void naive_partition_mesh( int rank) { - bool print_info = false; int num_elements_on_rank = 0; @@ -176,8 +175,6 @@ void naive_partition_mesh( } if (print_info) { - - std::cout< ghost_elem_receive_ranks; for (size_t i = 0; i < ghost_elem_gids_ordered.size(); ++i) { ghost_elem_receive_ranks.insert(ghost_elem_owner_ranks[i]); } - - // Print with ranks this rank will receive element data from sequentially - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "[rank " << rank << "] Ranks this rank will receive element data from: "; - for (int rank : ghost_elem_receive_ranks) { - std::cout << rank << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } - - // 
****************************************************************************************** // Build the final partitioned mesh // ****************************************************************************************** - - final_mesh.initialize_nodes(total_extended_nodes); final_mesh.initialize_elems(total_extended_elems, 3); final_mesh.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); @@ -1643,25 +1554,10 @@ void partition_mesh( final_mesh.num_ghost_elems = ghost_elem_gids.size(); final_mesh.num_ghost_nodes = ghost_only_nodes.size(); - // Set owned counts for write_vtk (excludes ghost elements/nodes) + final_mesh.num_owned_elems = intermediate_mesh.num_elems; final_mesh.num_owned_nodes = intermediate_mesh.num_nodes; - - // Print num ghost elements and nodes on each rank sequentially - for (int r = 0; r < world_size; ++r) { - if (rank == r) { - std::cout << "*******[rank " << rank << "] - Ghost elements: " << final_mesh.num_ghost_elems << std::endl; - std::cout << "*******[rank " << rank << "] - Ghost-only nodes: " << final_mesh.num_ghost_nodes << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } - - - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< ranks that ghost it std::unordered_map> gid_to_ghosting_ranks; @@ -1853,38 +1742,8 @@ void partition_mesh( } } - std::cout.flush(); MPI_Barrier(MPI_COMM_WORLD); - - // Optional: print a compact summary of reverse map for verification (limited output) - for(int i = 0; i < world_size; i++) { - if (rank == i && print_info) { - std::cout << std::endl; - for (int elem_lid = 0; elem_lid < final_mesh.num_owned_elems; elem_lid++) { - - size_t local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); - if (boundary_elem_targets[elem_lid].empty()) - { - std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: no ghost 
elements" << std::endl; - } - else - { - std::cout << "[rank " << rank << "] " << "elem_lid: "<< elem_lid <<" - elem_gid: " << local_elem_gid << " sends to: "; - int shown = 0; - for (const auto &pr : boundary_elem_targets[elem_lid]) { - if (shown >= 12) { std::cout << " ..."; break; } - std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; - shown++; - } - std::cout << std::endl; - } - } - } - MPI_Barrier(MPI_COMM_WORLD); - } - - // Add a vector to store boundary element local_ids (those who have ghost destinations across ranks) std::vector boundary_elem_local_ids; std::vector> boundary_to_ghost_ranks; // ragged array dimensions (num_boundary_elems, num_ghost_ranks) @@ -1923,40 +1782,6 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << std::endl; - std::cout << "[rank " << rank << "] elements communicates to ranks: "; - for (int i = 0; i < num_ghost_comm_ranks; ++i) { - std::cout << ghost_comm_ranks_vec[i] << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } - - print_info = false; - - // Print out the boundary element local ids on each rank sequentially - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r && print_info) { - std::cout << std::endl; - std::cout << "[rank " << rank << "] Boundary element global ids: " <(final_mesh.num_boundary_elems); for (int i = 0; i < final_mesh.num_boundary_elems; i++) { @@ -2084,27 +1909,9 @@ void partition_mesh( } } elems_to_recv_by_rank_rr.update_device(); - - // Debug: Print send vs recv counts per neighbor to diagnose mismatch - if (print_info) { - std::cout << "[rank " << rank << "] Send/Recv count comparison:" << std::endl; - for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { - int dest_rank = element_communication_plan.send_rank_ids.host(i); - int send_count = elems_to_send_by_rank_rr.stride_host(i); - std::cout << " To rank " << 
dest_rank << ": sending " << send_count << " elements" << std::endl; - } - for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { - int src_rank = element_communication_plan.recv_rank_ids.host(i); - int recv_count = elems_to_recv_by_rank_rr.stride_host(i); - std::cout << " From rank " << src_rank << ": expecting " << recv_count << " elements" << std::endl; - } - } - element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Finished building the send and recv counts and displacements for element communication" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); // ****************************************************************************************** // Test element communication using MPI_Neighbor_alltoallv @@ -2119,7 +1926,6 @@ void partition_mesh( // Initialize the gauss point fields on each rank // Set owned elements to rank number, ghost elements to -1 (to verify communication) for (int i = 0; i < final_mesh.num_owned_elems; i++) { - // if(rank == 0) std::cout << " Setting owned element " << i << " to rank " << rank << std::endl; gauss_point.fields.host(i) = static_cast(rank); gauss_point.fields_vec.host(i, 0) = static_cast(rank); gauss_point.fields_vec.host(i, 1) = static_cast(rank); diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index b14ee9cd..1106d99f 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {50, 50, 50}; + int num_elems_dim[3] = {180, 180, 180}; // Initial mesh built on rank zero Mesh_t initial_mesh; @@ -54,15 +54,24 @@ int main(int argc, char** argv) { if (rank == 0) { std::cout<<"World size: "< Date: Fri, 7 Nov 2025 16:47:28 -0600 Subject: [PATCH 25/52] ENH: Attempting to simplify building ghost 
and having a bad time --- examples/mesh_decomp/decomp_utils.h | 248 +++++++++++++-------------- examples/mesh_decomp/mesh_decomp.cpp | 5 +- 2 files changed, 120 insertions(+), 133 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index bab3f35d..3d50e682 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -63,7 +63,7 @@ void naive_partition_mesh( // Compute elements to send to each rank; handle remainders for non-even distribution std::fill(elems_per_rank.begin(), elems_per_rank.end(), initial_mesh.num_elems / world_size); int remainder = initial_mesh.num_elems % world_size; - for (int i = 0; i < remainder; ++i) { + for (int i = 0; i < remainder; i++) { elems_per_rank[i] += 1; } } @@ -529,6 +529,8 @@ void naive_partition_mesh( return; } + + void partition_mesh( Mesh_t& initial_mesh, Mesh_t& final_mesh, @@ -658,7 +660,7 @@ void partition_mesh( // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- SCOTCH_Num offset = 0; // running count of edges encountered - for (size_t lid = 0; lid < naive_mesh.num_elems; ++lid) { + for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) { // Record current edge offset for vertex lid in vertloctab vertloctab[lid] = offset; @@ -876,7 +878,7 @@ void partition_mesh( print_info = false; for(int rank_id = 0; rank_id < world_size; rank_id++) { if(rank_id == rank && print_info) { - for (size_t lid = 0; lid < naive_mesh.num_elems; ++lid) { + for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) { size_t gid = naive_mesh.local_to_global_elem_mapping.host(lid); std::cout << "[rank " << rank_id << "] elem_local=" << lid << " gid=" << gid << " -> part=" << partloctab[lid] << "\n"; @@ -887,9 +889,6 @@ void partition_mesh( } print_info = false; - - - // ****************************************************************************************** // Build the final mesh from the repartition // 
****************************************************************************************** @@ -902,7 +901,7 @@ void partition_mesh( // -------------- Phase 1: Determine elements to send to each rank -------------- std::vector> elems_to_send(world_size); - for (int lid = 0; lid < naive_mesh.num_elems; ++lid) { + for (int lid = 0; lid < naive_mesh.num_elems; lid++) { int dest = static_cast(partloctab[lid]); int elem_gid = static_cast(naive_mesh.local_to_global_elem_mapping.host(lid)); elems_to_send[dest].push_back(elem_gid); @@ -929,24 +928,24 @@ void partition_mesh( // Flatten send buffer - std::vector sendbuf; - sendbuf.reserve(send_total); + // send_elems: flattened list of element global IDs (GIDs) that this rank is sending to all other ranks. + // For each rank r, elems_to_send[r] contains the element GIDs that should be owned by rank r after repartitioning. + std::vector send_elems; + send_elems.reserve(send_total); for (int r = 0; r < world_size; ++r) - sendbuf.insert(sendbuf.end(), elems_to_send[r].begin(), elems_to_send[r].end()); + send_elems.insert(send_elems.end(), elems_to_send[r].begin(), elems_to_send[r].end()); - // Receive new local element GIDs - std::vector recvbuf(recv_total); - MPI_Alltoallv(sendbuf.data(), sendcounts.data(), sdispls.data(), MPI_INT, - recvbuf.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); + // new_elem_gids: receives the list of new element global IDs this rank will own after the exchange. + // It is filled after MPI_Alltoallv completes, and contains the GIDs for the elements new to (or remained on) this rank. 
+ std::vector new_elem_gids(recv_total); + MPI_Alltoallv(send_elems.data(), sendcounts.data(), sdispls.data(), MPI_INT, + new_elem_gids.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging element GIDs"< new_elem_gids = recvbuf; int num_new_elems = static_cast(new_elem_gids.size()); - if (print_info) { std::cout << "[rank " << rank << "] new elems: " << num_new_elems << std::endl; } @@ -956,12 +955,12 @@ void partition_mesh( // Flatten element-node connectivity by global node IDs std::vector conn_sendbuf; - for (int r = 0; r < world_size; ++r) { - for (int gid : elems_to_send[r]) { - // find local element lid from gid + for (int r = 0; r < world_size; r++) { + for (int elem_gid : elems_to_send[r]) { + // find local element lid from elem_gid int lid = -1; - for (int i = 0; i < naive_mesh.num_elems; ++i) - if (naive_mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + for (int i = 0; i < naive_mesh.num_elems; i++) + if (naive_mesh.local_to_global_elem_mapping.host(i) == elem_gid) { lid = i; break; } for (int j = 0; j < nodes_per_elem; j++) { int node_lid = naive_mesh.nodes_in_elem.host(lid, j); @@ -973,7 +972,7 @@ void partition_mesh( // element-node connectivity counts (ints per dest rank) std::vector conn_sendcounts(world_size), conn_recvcounts(world_size); - for (int r = 0; r < world_size; ++r) + for (int r = 0; r < world_size; r++) conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); @@ -1005,7 +1004,7 @@ void partition_mesh( // Build map gid→lid std::unordered_map node_gid_to_lid; - for (int i = 0; i < num_new_nodes; ++i) + for (int i = 0; i < num_new_nodes; i++) node_gid_to_lid[new_node_gids[i]] = i; if (print_info) @@ -1017,7 +1016,7 @@ void partition_mesh( for (int r = 0; r < world_size; ++r) { for (int gid : elems_to_send[r]) { int lid = -1; - for 
(int i = 0; i < naive_mesh.num_elems; ++i) + for (int i = 0; i < naive_mesh.num_elems; i++) if (naive_mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } for (int j = 0; j < nodes_per_elem; j++) { @@ -1063,9 +1062,9 @@ void partition_mesh( intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); // Fill global mappings - for (int i = 0; i < num_new_nodes; ++i) + for (int i = 0; i < num_new_nodes; i++) intermediate_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; - for (int i = 0; i < num_new_elems; ++i) + for (int i = 0; i < num_new_elems; i++) intermediate_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; intermediate_mesh.local_to_global_node_mapping.update_device(); @@ -1075,10 +1074,10 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"<> node_gid_to_coords; int coord_idx = 0; - for (int e = 0; e < num_new_elems; ++e) { - for (int j = 0; j < nodes_per_elem; j++) { - int node_gid = conn_recvbuf[e * nodes_per_elem + j]; + for (int e = 0; e < intermediate_mesh.num_elems; ++e) { + for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { + int node_gid = conn_recvbuf[e * intermediate_mesh.num_nodes_in_elem + j]; if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { node_gid_to_coords[node_gid] = { coord_recvbuf[coord_idx*3 + 0], @@ -1127,7 +1125,7 @@ void partition_mesh( // Now fill coordinates in node order intermediate_node.initialize(num_new_nodes, 3, {node_state::coords}); - for (int i = 0; i < num_new_nodes; ++i) { + for (int i = 0; i < num_new_nodes; i++) { int node_gid = new_node_gids[i]; auto it = node_gid_to_coords.find(node_gid); if (it != node_gid_to_coords.end()) { @@ -1152,18 +1150,9 @@ void partition_mesh( // First, gather the number of elements each rank owns std::vector elem_counts(world_size); - - // int MPI_Allgather( - // const 
void* sendbuf, // Data to send from this process - // int sendcount, // Number of elements to send - // MPI_Datatype sendtype, // Type of send data - // void* recvbuf, // Buffer to receive all data - // int recvcount, // Number of elements to receive from each process - // MPI_Datatype recvtype, // Type of receive data - // MPI_Comm comm // Communicator - // ); MPI_Allgather(&intermediate_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); + // Compute displacements std::vector elem_displs(world_size); int total_elems = 0; @@ -1174,17 +1163,6 @@ void partition_mesh( // Gather all element GIDs from all ranks std::vector all_elem_gids(total_elems); - - // int MPI_Allgatherv( - // const void* sendbuf, // Data to send from this process - // int sendcount, // Number of elements THIS process sends - // MPI_Datatype sendtype, // Type of send data - // void* recvbuf, // Buffer to receive all data - // const int* recvcounts, // Array: number of elements from each process - // const int* displs, // Array: displacement for each process's data - // MPI_Datatype recvtype, // Type of receive data - // MPI_Comm comm // Communicator - // ); MPI_Allgatherv(intermediate_mesh.local_to_global_elem_mapping.host_pointer(), intermediate_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, all_elem_gids.data(), elem_counts.data(), elem_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); @@ -1192,36 +1170,32 @@ void partition_mesh( // Build a map: element GID -> owning rank std::map elem_gid_to_rank; for (int r = 0; r < world_size; ++r) { - for (int i = 0; i < elem_counts[r]; ++i) { + for (int i = 0; i < elem_counts[r]; i++) { size_t gid = all_elem_gids[elem_displs[r] + i]; elem_gid_to_rank[gid] = r; } } - // Strategy: Find ghost elements by checking neighbors of our boundary elements. - // A boundary element is one that has a neighbor owned by another rank. 
- // However, since build_connectivity() only includes locally-owned elements, - // we need to use a different approach: find elements on other ranks that share + // Strategy: Find elements on other ranks that share // nodes with our locally-owned elements. // First, collect all nodes that belong to our locally-owned elements std::set local_elem_nodes; - for (int lid = 0; lid < num_new_elems; ++lid) { - for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - local_elem_nodes.insert(node_gid); - } + + for(int node_rid = 0; node_rid < intermediate_mesh.num_nodes; node_rid++) { + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_rid); + local_elem_nodes.insert(node_gid); } + // Now collect element-to-node connectivity to send to all ranks // Format: for each element, list its node GIDs (each entry is a pair: elem_gid, node_gid) std::vector elem_node_conn; int local_conn_size = 0; - for (int lid = 0; lid < num_new_elems; ++lid) { + for (int lid = 0; lid < intermediate_mesh.num_elems; lid++) { size_t elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(lid); - for (int j = 0; j < nodes_per_elem; j++) { + for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); elem_node_conn.push_back(elem_gid); @@ -1229,12 +1203,15 @@ void partition_mesh( } local_conn_size += nodes_per_elem * 2; // Each pair is 2 size_ts } + + // Exchange element-node connectivity with all ranks using Allgather // First, gather the sizes from each rank std::vector conn_sizes(world_size); MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); + // Compute displacements std::vector conn_displs(world_size); int total_conn 
= 0; @@ -1249,9 +1226,40 @@ void partition_mesh( all_conn.data(), conn_sizes.data(), conn_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); + + + DCArrayKokkos local_nodes_in_elem(intermediate_mesh.num_elems, intermediate_mesh.num_nodes_in_elem); + DCArrayKokkos all_nodes_in_elem(total_elems, intermediate_mesh.num_nodes_in_elem); + + std::vector mtr_conn_sizes(world_size); + + + local_nodes_in_elem = intermediate_mesh.nodes_in_elem; + int mtr_size = intermediate_mesh.num_elems * intermediate_mesh.num_nodes_in_elem; + + MPI_Allgather(&mtr_size, 1, MPI_INT, mtr_conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + // Compute displacements + std::vector mtr_conn_displs(world_size); + int total_mtr_conn = 0; + for (int r = 0; r < world_size; ++r) { + mtr_conn_displs[r] = total_mtr_conn; + total_mtr_conn += mtr_conn_sizes[r]; + } + + + MPI_Allgatherv(local_nodes_in_elem.host_pointer(), mtr_size, MPI_UNSIGNED_LONG_LONG, + all_nodes_in_elem.host_pointer(), mtr_conn_sizes.data(), mtr_conn_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + + + + // create a set for local_elem_gids std::set local_elem_gids; - for (int i = 0; i < num_new_elems; ++i) { + for (int i = 0; i < intermediate_mesh.num_elems; i++) { local_elem_gids.insert(intermediate_mesh.local_to_global_elem_mapping.host(i)); } @@ -1261,7 +1269,7 @@ void partition_mesh( if (r == rank) continue; // Skip our own data // Process pairs from rank r: conn_sizes[r] is in units of size_ts, so num_pairs = conn_sizes[r] / 2 int num_pairs = conn_sizes[r] / 2; - for (int i = 0; i < num_pairs; ++i) { + for (int i = 0; i < num_pairs; i++) { // Each pair is 2 size_ts, starting at conn_displs[r] int offset = conn_displs[r] + i * 2; size_t elem_gid = all_conn[offset]; @@ -1288,24 +1296,25 @@ void partition_mesh( // Additional check: elements that are neighbors of our locally-owned elements // but are owned by other ranks (these 
might already be in ghost_elem_gids, but check connectivity) - for (int lid = 0; lid < num_new_elems; ++lid) { - size_t num_neighbors = intermediate_mesh.num_elems_in_elem(lid); + // for (int lid = 0; lid < num_new_elems; lid++) { + // size_t num_neighbors = intermediate_mesh.num_elems_in_elem(lid); - for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { - size_t neighbor_lid = intermediate_mesh.elems_in_elem(lid, nbr_idx); + // for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { + // size_t neighbor_lid = intermediate_mesh.elems_in_elem(lid, nbr_idx); - if (neighbor_lid < static_cast(num_new_elems)) { - size_t neighbor_gid = intermediate_mesh.local_to_global_elem_mapping(neighbor_lid); + // if (neighbor_lid < static_cast(num_new_elems)) { + // size_t neighbor_gid = intermediate_mesh.local_to_global_elem_mapping(neighbor_lid); - // Check if neighbor is owned by this rank - auto it = elem_gid_to_rank.find(neighbor_gid); - if (it != elem_gid_to_rank.end() && it->second != rank) { - // Neighbor is owned by another rank - it's a ghost for us - ghost_elem_gids.insert(neighbor_gid); - } - } - } - } + // // Check if neighbor is owned by this rank + // auto it = elem_gid_to_rank.find(neighbor_gid); + // if (it != elem_gid_to_rank.end() && it->second != rank) { + // // Neighbor is owned by another rank - it's a ghost for us + // std::cout << "[rank " << rank << "] found ghost element " << neighbor_gid << std::endl; + // ghost_elem_gids.insert(neighbor_gid); + // } + // } + // } + // } // Count unique ghost elements intermediate_mesh.num_ghost_elems = ghost_elem_gids.size(); @@ -1317,32 +1326,6 @@ void partition_mesh( std::cout << " Finished calculating ghost elements" << std::endl; std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." 
<< std::endl; } - - // Print ghost element info if requested - print_info = false; - for(int i = 0; i < world_size; i++) { - MPI_Barrier(MPI_COMM_WORLD); - if(rank == i && print_info) { - std::cout << "[rank " << rank << "] owns " << num_new_elems - << " elements and has " << intermediate_mesh.num_ghost_elems << " ghost elements" << std::endl; - std::cout << "[rank " << rank << "] owned element global IDs: "; - for (int j = 0; j < intermediate_mesh.num_elems; j++) { - std::cout << intermediate_mesh.local_to_global_elem_mapping(j) << " "; - } - - // Print global IDs of ghost elements - std::cout << std::endl << "[rank " << rank << "] ghost element global IDs: "; - for (const auto& gid : ghost_elem_gids) { - std::cout << gid << " "; - } - std::cout << std::endl; - } - - MPI_Barrier(MPI_COMM_WORLD); - } - - - // Build the connectivity that includes ghost elements // Create an extended mesh with owned elements first, then ghost elements appended @@ -1353,17 +1336,18 @@ void partition_mesh( // Build a map: ghost_elem_gid -> vector of node_gids (ordered as in all_conn) std::map> ghost_elem_to_nodes; for (const size_t& ghost_gid : ghost_elem_gids) { - ghost_elem_to_nodes[ghost_gid].reserve(nodes_per_elem); + ghost_elem_to_nodes[ghost_gid].reserve(intermediate_mesh.num_nodes_in_elem); } // Extract nodes for each ghost element from all_conn // The all_conn array has pairs (elem_gid, node_gid) for each rank's elements for (int r = 0; r < world_size; ++r) { if (r == rank) continue; // Skip our own data (we already have owned element connectivity) + int num_pairs = conn_sizes[r] / 2; // Process pairs in order - each element's nodes are contiguous - for (int i = 0; i < num_pairs; ++i) { + for (int i = 0; i < num_pairs; i++) { int offset = conn_displs[r] + i * 2; size_t elem_gid = all_conn[offset]; size_t node_gid = all_conn[offset + 1]; @@ -1378,9 +1362,9 @@ void partition_mesh( // Verify each ghost element has the correct number of nodes for (auto& pair : ghost_elem_to_nodes) 
{ - if (pair.second.size() != static_cast(nodes_per_elem)) { + if (pair.second.size() != static_cast(intermediate_mesh.num_nodes_in_elem)) { std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first - << " has " << pair.second.size() << " nodes, expected " << nodes_per_elem << std::endl; + << " has " << pair.second.size() << " nodes, expected " << intermediate_mesh.num_nodes_in_elem << std::endl; } } @@ -1390,7 +1374,7 @@ void partition_mesh( int extended_node_lid = 0; // Add all owned nodes - for (int i = 0; i < intermediate_mesh.num_nodes; ++i) { + for (int i = 0; i < intermediate_mesh.num_nodes; i++) { size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(i); node_gid_to_extended_lid[node_gid] = extended_node_lid++; } @@ -1448,7 +1432,7 @@ void partition_mesh( std::vector> extended_nodes_in_elem(total_extended_elems); // Copy owned element connectivity (convert to extended node LIDs) - for (int lid = 0; lid < intermediate_mesh.num_elems; ++lid) { + for (int lid = 0; lid < intermediate_mesh.num_elems; lid++) { extended_nodes_in_elem[lid].reserve(nodes_per_elem); for (int j = 0; j < nodes_per_elem; j++) { size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); @@ -1505,17 +1489,17 @@ void partition_mesh( // Build extended element GID list: owned first, then ghost std::vector extended_lid_to_elem_gid(total_extended_elems); // Owned elements - for (int i = 0; i < intermediate_mesh.num_elems; ++i) { + for (int i = 0; i < intermediate_mesh.num_elems; i++) { extended_lid_to_elem_gid[i] = intermediate_mesh.local_to_global_elem_mapping.host(i); } // Ghost elements (in sorted order) - for (size_t idx = 0; idx < ghost_elem_gids_ordered.size(); ++idx) { - extended_lid_to_elem_gid[intermediate_mesh.num_elems + idx] = ghost_elem_gids_ordered[idx]; + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { + extended_lid_to_elem_gid[intermediate_mesh.num_elems + i] = ghost_elem_gids_ordered[i]; } // Build array: for each ghost 
element, store which rank owns it (where to receive data from) std::vector ghost_elem_owner_ranks(ghost_elem_gids_ordered.size()); - for (size_t i = 0; i < ghost_elem_gids_ordered.size(); ++i) { + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { size_t ghost_gid = ghost_elem_gids_ordered[i]; auto it = elem_gid_to_rank.find(ghost_gid); if (it != elem_gid_to_rank.end()) { @@ -1529,7 +1513,7 @@ void partition_mesh( // Create a std::set of all the ranks this rank will receive data from std::set ghost_elem_receive_ranks; - for (size_t i = 0; i < ghost_elem_gids_ordered.size(); ++i) { + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { ghost_elem_receive_ranks.insert(ghost_elem_owner_ranks[i]); } @@ -1595,13 +1579,13 @@ void partition_mesh( // 1. Build list of all global node IDs needed on this rank (owned + ghosts) std::vector all_needed_node_gids(total_extended_nodes); - for (int i = 0; i < total_extended_nodes; ++i) { + for (int i = 0; i < total_extended_nodes; i++) { all_needed_node_gids[i] = final_mesh.local_to_global_node_mapping.host(i); } // 2. Build owned node GIDs and their coordinates std::vector owned_gids(final_mesh.num_owned_nodes); - for (int i = 0; i < final_mesh.num_owned_nodes; ++i) + for (int i = 0; i < final_mesh.num_owned_nodes; i++) owned_gids[i] = final_mesh.local_to_global_node_mapping.host(i); // 3. Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) @@ -1633,7 +1617,7 @@ void partition_mesh( // d) Global coords (size: total_owned x 3) std::vector owned_coords_send(3*local_owned_count, 0.0); - for (int i=0; i coord[3] std::unordered_map> gid_to_coord; - for (int i=0; i xyz = { all_owned_coords[3*i+0], all_owned_coords[3*i+1], @@ -1664,7 +1648,7 @@ void partition_mesh( } // 4. Finally, fill final_node.coords with correct coordinates. 
- for (int i = 0; i < total_extended_nodes; ++i) { + for (int i = 0; i < total_extended_nodes; i++) { size_t gid = final_mesh.local_to_global_node_mapping.host(i); auto it = gid_to_coord.find(gid); if (it != gid_to_coord.end()) { @@ -1694,7 +1678,7 @@ void partition_mesh( // Prepare local ghost list as vector std::vector ghost_gids_vec; ghost_gids_vec.reserve(final_mesh.num_ghost_elems); - for (int i = 0; i < final_mesh.num_ghost_elems; ++i) { + for (int i = 0; i < final_mesh.num_ghost_elems; i++) { ghost_gids_vec.push_back(final_mesh.local_to_global_elem_mapping.host(final_mesh.num_owned_elems + i)); // Ghost elements are after the owned elements in the global element mapping } @@ -1724,7 +1708,7 @@ void partition_mesh( for (int r = 0; r < world_size; ++r) { int cnt = ghost_counts[r]; int off = ghost_displs[r]; - for (int i = 0; i < cnt; ++i) { + for (int i = 0; i < cnt; i++) { size_t g = all_ghost_gids[off + i]; gid_to_ghosting_ranks[g].push_back(r); } @@ -2014,7 +1998,7 @@ void partition_mesh( // for (int r = 0; r < world_size; ++r) { // int cnt = ghost_node_counts[r]; // int off = ghost_node_displs[r]; - // for (int i = 0; i < cnt; ++i) { + // for (int i = 0; i < cnt; i++) { // size_t g = all_ghost_node_gids[off + i]; // node_gid_to_ghosting_ranks[g].push_back(r); // } diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 1106d99f..7de5d847 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {180, 180, 180}; + int num_elems_dim[3] = {100, 100, 100}; // Initial mesh built on rank zero Mesh_t initial_mesh; @@ -69,6 +69,9 @@ int main(int argc, char** argv) { double t_partition_start = MPI_Wtime(); partition_mesh(initial_mesh, final_mesh, initial_node, final_node, gauss_point, world_size, rank); double t_partition_end = 
MPI_Wtime(); + + + if(rank == 0) { printf("Mesh partitioning time: %.2f seconds\n", t_partition_end - t_partition_start); } From 8a7de21ee6ddf4f6e822cc5c34d35266d8e714d3 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 7 Nov 2025 16:59:29 -0600 Subject: [PATCH 26/52] DOC: Improving documentation of ghost --- examples/mesh_decomp/decomp_utils.h | 221 +++++++++++++++++++--------- 1 file changed, 152 insertions(+), 69 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 3d50e682..ff4f87cd 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -849,7 +849,7 @@ void partition_mesh( // Use SCOTCH_STRATQUALITY for best cut quality. // To change: replace with SCOTCH_STRATDEFAULT, SCOTCH_STRATSPEED, or SCOTCH_STRATBALANCE as discussed above. // Arguments: (strategy object, strategy flag, #parts, recursion (0=auto), imbalance ratio) - SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.01); + SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.001); // partloctab: output array mapping each local element (vertex) to a *target partition number* // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. @@ -1144,16 +1144,71 @@ void partition_mesh( // ****************************************************************************************** // Build the ghost elements and nodes -// ****************************************************************************************** +// ================================================================================================** +// +// OVERVIEW OF GHOST ELEMENT IDENTIFICATION: +// ========================================== +// In distributed memory parallel computing with MPI, each processor (rank) owns a subset of mesh +// elements. 
However, to perform computations that depend on element neighbors or to maintain +// consistency at domain boundaries, we need ghost elements: copies of elements from neighboring +// ranks that share nodes with our locally-owned elements. +// +// This algorithm identifies and extracts ghost element data in 5 steps: +// 1. Gather ownership information: Which rank owns which elements (via MPI_Allgatherv) +// 2. Collect local element-node connectivity for distribution +// 3. Broadcast connectivity to all ranks (via MPI_Allgatherv) +// 4. Identify which remote elements touch our local elements +// 5. Extract the full connectivity data for identified ghost elements +// +// KEY DATA STRUCTURES: +// - elem_gid_to_rank: Map from element global ID to owning rank +// - all_elem_gids: Every element GID from every rank (on every rank) +// - all_conn: Flattened (elem_gid, node_gid) pairs from every rank (on every rank) +// - ghost_elem_gids: Set of remote element GIDs that are ghosts for this rank +// - ghost_elem_to_nodes: Map from ghost element GID to its node GIDs +// +// WHY THIS APPROACH? +// - MPI_Allgatherv is efficient for gathering all data to all ranks +// - Connectivity pairs allow flexible reconstruction of element-node relationships +// - Using sets and maps for efficient lookups (O(log n) instead of O(n)) +// - Distributed computation avoids a single bottleneck rank +// double t_ghost_start = MPI_Wtime(); - // First, gather the number of elements each rank owns + // ======================================================================== + // STEP 1: Gather element ownership information from all ranks + // ======================================================================== + // In a distributed mesh, each rank owns a subset of elements. To identify + // ghost elements (elements from other ranks needed by this rank), we need + // to know which rank owns each element. This section uses MPI collective + // operations to gather element GID ownership information. 
+ // + // MPI COLLECTIVE OPERATIONS EXPLAINED: + // ==================================== + // - MPI_Barrier: Synchronizes all ranks; waits until all ranks reach this point + // - MPI_Allgather: Each rank sends one item of data; each rank receives one item from each rank + // Input: Each rank provides local data + // Output: Every rank has data from every rank in order (rank 0's data, rank 1's data, ...) + // - MPI_Allgatherv: Like MPI_Allgather but for variable-sized data + // Input: Each rank provides data of potentially different sizes + // Output: Every rank has all data from all ranks, with displacement arrays specifying where each rank's data goes + // + // COMMUNICATION PATTERN VISUALIZATION: + // Rank 0: elem_count[0] ----> All ranks receive: [elem_count[0], elem_count[1], elem_count[2], ...] + // Rank 1: elem_count[1] / + // Rank 2: elem_count[2] / + + // MPI_Allgather: Each rank sends its element count, every rank receives + // the count from every other rank. Result: elem_counts[r] = number of + // elements owned by rank r. std::vector elem_counts(world_size); MPI_Allgather(&intermediate_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); // Synchronize all ranks before proceeding - // Compute displacements + // Compute displacements: offset into the global array for each rank's data + // Example: if elem_counts = [100, 150, 120], then + // elem_displs = [0, 100, 250] (where each rank's data starts in all_elem_gids) std::vector elem_displs(world_size); int total_elems = 0; for (int r = 0; r < world_size; ++r) { @@ -1161,13 +1216,18 @@ void partition_mesh( total_elems += elem_counts[r]; } - // Gather all element GIDs from all ranks + // MPI_Allgatherv: Gather variable-sized data from all ranks into one array + // Each rank contributes its local_to_global_elem_mapping, which maps + // local element indices to global element GIDs. 
After this call, + // all_elem_gids contains ALL element GIDs from all ranks, organized by rank. std::vector all_elem_gids(total_elems); MPI_Allgatherv(intermediate_mesh.local_to_global_elem_mapping.host_pointer(), intermediate_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, all_elem_gids.data(), elem_counts.data(), elem_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - // Build a map: element GID -> owning rank + + // Build a lookup map: element GID -> owning rank + // This allows O(log n) lookups to determine which rank owns any given element. std::map elem_gid_to_rank; for (int r = 0; r < world_size; ++r) { for (int i = 0; i < elem_counts[r]; i++) { @@ -1176,43 +1236,66 @@ void partition_mesh( } } - // Strategy: Find elements on other ranks that share - // nodes with our locally-owned elements. + // ======================================================================== + // STEP 2: Build element-to-node connectivity for local elements + // ======================================================================== + // Ghost elements are elements from other ranks that share nodes with our + // locally-owned elements. To identify them, we need to exchange element-node + // connectivity information with all other ranks. 
- // First, collect all nodes that belong to our locally-owned elements + // Collect all nodes that belong to our locally-owned elements + // This set will be used later to check if a remote element is relevant std::set local_elem_nodes; - for(int node_rid = 0; node_rid < intermediate_mesh.num_nodes; node_rid++) { size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_rid); local_elem_nodes.insert(node_gid); } - - // Now collect element-to-node connectivity to send to all ranks - // Format: for each element, list its node GIDs (each entry is a pair: elem_gid, node_gid) + // ======================================================================== + // STEP 3: Exchange element-to-node connectivity via MPI_Allgatherv + // ======================================================================== + // Build a flattened connectivity array: pairs of (elem_gid, node_gid) + // Example for 2 elements with 8 nodes each: + // elem_node_conn = [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] + // + // This format is chosen because it's easy to serialize and deserialize over MPI, + // and allows us to reconstruct the full element-node relationships. 
std::vector elem_node_conn; int local_conn_size = 0; + // For each locally-owned element, record its GID and all its node GIDs for (int lid = 0; lid < intermediate_mesh.num_elems; lid++) { size_t elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(lid); + + // Access nodes_in_elem[lid][*] to get all nodes in this element for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { - size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); + size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); // Local index + size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); // Global index + elem_node_conn.push_back(elem_gid); elem_node_conn.push_back(node_gid); } - local_conn_size += nodes_per_elem * 2; // Each pair is 2 size_ts + local_conn_size += nodes_per_elem * 2; // Each element contributes (num_nodes_in_elem * 2) size_ts } - // Exchange element-node connectivity with all ranks using Allgather - // First, gather the sizes from each rank + // ======================================================================== + // Perform MPI communication to gather connectivity from all ranks + // ======================================================================== + // Similar to Step 1, we use MPI_Allgatherv to collect all element-node + // connectivity pairs. 
This is a two-stage process: + // 1) Gather the size of each rank's connectivity data + // 2) Gather the actual connectivity data with proper offsets + + // Stage 1: Gather connectivity sizes from each rank + // conn_sizes[r] = number of size_t values that rank r will send std::vector conn_sizes(world_size); MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - // Compute displacements + // Compute displacements for the second MPI_Allgatherv call + // Displcements tell each rank where its data should be placed in the global array std::vector conn_displs(world_size); int total_conn = 0; for (int r = 0; r < world_size; ++r) { @@ -1220,72 +1303,59 @@ void partition_mesh( total_conn += conn_sizes[r]; } - // Gather all element-node pairs from all ranks + // Stage 2: Gather all element-node connectivity data + // After this call, all_conn contains the flattened connectivity from every rank, + // organized by rank. Access data from rank r using indices [conn_displs[r], conn_displs[r] + conn_sizes[r]) std::vector all_conn(total_conn); MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG, all_conn.data(), conn_sizes.data(), conn_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - - - DCArrayKokkos local_nodes_in_elem(intermediate_mesh.num_elems, intermediate_mesh.num_nodes_in_elem); - DCArrayKokkos all_nodes_in_elem(total_elems, intermediate_mesh.num_nodes_in_elem); - - std::vector mtr_conn_sizes(world_size); - - local_nodes_in_elem = intermediate_mesh.nodes_in_elem; - int mtr_size = intermediate_mesh.num_elems * intermediate_mesh.num_nodes_in_elem; - - MPI_Allgather(&mtr_size, 1, MPI_INT, mtr_conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - - // Compute displacements - std::vector mtr_conn_displs(world_size); - int total_mtr_conn = 0; - for (int r = 0; r < world_size; ++r) { - mtr_conn_displs[r] = total_mtr_conn; - 
total_mtr_conn += mtr_conn_sizes[r]; - } - - - MPI_Allgatherv(local_nodes_in_elem.host_pointer(), mtr_size, MPI_UNSIGNED_LONG_LONG, - all_nodes_in_elem.host_pointer(), mtr_conn_sizes.data(), mtr_conn_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - - - + // ======================================================================== + // STEP 4: Identify ghost elements + // ======================================================================== + // A ghost element is an element owned by another rank that shares at least + // one node with our locally-owned elements. This step identifies all such elements. - // create a set for local_elem_gids + // Build a set of locally-owned element GIDs for quick lookup std::set local_elem_gids; for (int i = 0; i < intermediate_mesh.num_elems; i++) { local_elem_gids.insert(intermediate_mesh.local_to_global_elem_mapping.host(i)); } - // Build a map: node GID -> set of element GIDs that contain it (from other ranks) + // Build a temporary map: node GID -> set of element GIDs (from other ranks) that contain it + // This helps us identify which remote elements are adjacent to our local elements std::map> node_to_ext_elem; + + // Iterate through connectivity data from each rank (except ourselves) for (int r = 0; r < world_size; ++r) { - if (r == rank) continue; // Skip our own data - // Process pairs from rank r: conn_sizes[r] is in units of size_ts, so num_pairs = conn_sizes[r] / 2 + if (r == rank) continue; // Skip our own data - we already know our elements + + // Parse the connectivity data for rank r + // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
+ // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 int num_pairs = conn_sizes[r] / 2; + for (int i = 0; i < num_pairs; i++) { - // Each pair is 2 size_ts, starting at conn_displs[r] + // Offset into all_conn for this pair (elem_gid, node_gid) int offset = conn_displs[r] + i * 2; size_t elem_gid = all_conn[offset]; size_t node_gid = all_conn[offset + 1]; - // If this node is in one of our elements, then the element is a potential ghost + // Check if this node belongs to one of our locally-owned elements if (local_elem_nodes.find(node_gid) != local_elem_nodes.end()) { - // Check if this element is not owned by us + // Check if this element is NOT owned by us (i.e., it's from another rank) if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { + // This is a ghost element for us node_to_ext_elem[node_gid].insert(elem_gid); } } } } - // Collect all unique ghost element GIDs + // Extract all unique ghost element GIDs + // We use a set to eliminate duplicates (same ghost element might share multiple nodes with us) std::set ghost_elem_gids; for (const auto& pair : node_to_ext_elem) { for (size_t elem_gid : pair.second) { @@ -1316,7 +1386,7 @@ void partition_mesh( // } // } - // Count unique ghost elements + // Store the count of ghost elements for later use intermediate_mesh.num_ghost_elems = ghost_elem_gids.size(); MPI_Barrier(MPI_COMM_WORLD); @@ -1326,33 +1396,43 @@ void partition_mesh( std::cout << " Finished calculating ghost elements" << std::endl; std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." 
<< std::endl; } - // Build the connectivity that includes ghost elements - // Create an extended mesh with owned elements first, then ghost elements appended + + // ======================================================================== + // STEP 5: Extract ghost element connectivity + // ======================================================================== + // Now that we know which elements are ghosts, we need to extract their + // full node connectivity from all_conn. This allows us to properly construct + // the extended mesh with ghost elements included. MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout << " Starting to build extended mesh with ghost elements" << std::endl; - // Step 1: Extract ghost element-node connectivity from all_conn - // Build a map: ghost_elem_gid -> vector of node_gids (ordered as in all_conn) + // Build a map: ghost_elem_gid -> vector of node_gids + // We pre-allocate the vector size to avoid repeated reallocations std::map> ghost_elem_to_nodes; for (const size_t& ghost_gid : ghost_elem_gids) { ghost_elem_to_nodes[ghost_gid].reserve(intermediate_mesh.num_nodes_in_elem); } - // Extract nodes for each ghost element from all_conn - // The all_conn array has pairs (elem_gid, node_gid) for each rank's elements + // ======================================================================== + // Extract nodes for each ghost element from the globally-collected all_conn + // ======================================================================== + // The all_conn array was populated by MPI_Allgatherv and contains connectivity + // pairs (elem_gid, node_gid) for all elements from all ranks. We now parse + // this data to extract the nodes for each ghost element. 
for (int r = 0; r < world_size; ++r) { - if (r == rank) continue; // Skip our own data (we already have owned element connectivity) + if (r == rank) continue; // Skip our own data - we already have owned element connectivity + // Parse connectivity data for rank r int num_pairs = conn_sizes[r] / 2; - // Process pairs in order - each element's nodes are contiguous for (int i = 0; i < num_pairs; i++) { + // Calculate offset for this pair: displacement + (pair_index * 2) int offset = conn_displs[r] + i * 2; size_t elem_gid = all_conn[offset]; size_t node_gid = all_conn[offset + 1]; - // If this is one of our ghost elements, record its node (in order) + // If this element is one of our identified ghost elements, record its node auto it = ghost_elem_to_nodes.find(elem_gid); if (it != ghost_elem_to_nodes.end()) { it->second.push_back(node_gid); @@ -1360,7 +1440,10 @@ void partition_mesh( } } - // Verify each ghost element has the correct number of nodes + // ======================================================================== + // Validation: Verify each ghost element has the correct number of nodes + // ======================================================================== + // This catch detects issues in the MPI communication or parsing logic for (auto& pair : ghost_elem_to_nodes) { if (pair.second.size() != static_cast(intermediate_mesh.num_nodes_in_elem)) { std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first From accd023660cf07719e661c7ce676ea0c0bd82999 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 10 Nov 2025 11:01:12 -0600 Subject: [PATCH 27/52] STYLE: Tidying up, and testing with vtk read mesh --- examples/mesh_decomp/decomp_utils.h | 59 +++---- examples/mesh_decomp/mesh_decomp.cpp | 4 + examples/mesh_decomp/mesh_io.h | 254 +++++++++++++++++++++++++++ 3 files changed, 287 insertions(+), 30 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index ff4f87cd..dada0d99 100644 --- 
a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -909,7 +909,7 @@ void partition_mesh( // -------------- Phase 2: Exchange element GIDs -------------- std::vector sendcounts(world_size), recvcounts(world_size); - for (int r = 0; r < world_size; ++r) + for (int r = 0; r < world_size; r++) sendcounts[r] = static_cast(elems_to_send[r].size()); MPI_Alltoall(sendcounts.data(), 1, MPI_INT, recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); @@ -919,7 +919,7 @@ void partition_mesh( // Compute displacements std::vector sdispls(world_size), rdispls(world_size); int send_total = 0, recv_total = 0; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { sdispls[r] = send_total; rdispls[r] = recv_total; send_total += sendcounts[r]; @@ -932,7 +932,7 @@ void partition_mesh( // For each rank r, elems_to_send[r] contains the element GIDs that should be owned by rank r after repartitioning. std::vector send_elems; send_elems.reserve(send_total); - for (int r = 0; r < world_size; ++r) + for (int r = 0; r < world_size; r++) send_elems.insert(send_elems.end(), elems_to_send[r].begin(), elems_to_send[r].end()); // new_elem_gids: receives the list of new element global IDs this rank will own after the exchange. 
@@ -983,7 +983,7 @@ void partition_mesh( std::vector conn_sdispls(world_size), conn_rdispls(world_size); int conn_send_total = 0, conn_recv_total = 0; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { conn_sdispls[r] = conn_send_total; conn_rdispls[r] = conn_recv_total; conn_send_total += conn_sendcounts[r]; @@ -1013,7 +1013,7 @@ void partition_mesh( // -------------- Phase 5: Request node coordinates -------------- std::vector node_coords_sendbuf; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { for (int gid : elems_to_send[r]) { int lid = -1; for (int i = 0; i < naive_mesh.num_elems; i++) @@ -1032,7 +1032,7 @@ void partition_mesh( // Each node is 3 doubles; same sendcounts scaling applies std::vector coord_sendcounts(world_size), coord_recvcounts(world_size); - for (int r = 0; r < world_size; ++r) + for (int r = 0; r < world_size; r++) coord_sendcounts[r] = sendcounts[r] * nodes_per_elem * 3; MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); @@ -1041,7 +1041,7 @@ void partition_mesh( std::vector coord_sdispls(world_size), coord_rdispls(world_size); int coord_send_total = 0, coord_recv_total = 0; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { coord_sdispls[r] = coord_send_total; coord_rdispls[r] = coord_recv_total; coord_send_total += coord_sendcounts[r]; @@ -1211,7 +1211,7 @@ void partition_mesh( // elem_displs = [0, 100, 250] (where each rank's data starts in all_elem_gids) std::vector elem_displs(world_size); int total_elems = 0; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { elem_displs[r] = total_elems; total_elems += elem_counts[r]; } @@ -1229,7 +1229,7 @@ void partition_mesh( // Build a lookup map: element GID -> owning rank // This allows O(log n) lookups to determine which rank owns any given element. 
std::map elem_gid_to_rank; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { for (int i = 0; i < elem_counts[r]; i++) { size_t gid = all_elem_gids[elem_displs[r] + i]; elem_gid_to_rank[gid] = r; @@ -1298,7 +1298,7 @@ void partition_mesh( // Displcements tell each rank where its data should be placed in the global array std::vector conn_displs(world_size); int total_conn = 0; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { conn_displs[r] = total_conn; total_conn += conn_sizes[r]; } @@ -1329,7 +1329,7 @@ void partition_mesh( std::map> node_to_ext_elem; // Iterate through connectivity data from each rank (except ourselves) - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { if (r == rank) continue; // Skip our own data - we already know our elements // Parse the connectivity data for rank r @@ -1390,12 +1390,7 @@ void partition_mesh( intermediate_mesh.num_ghost_elems = ghost_elem_gids.size(); MPI_Barrier(MPI_COMM_WORLD); - double t_ghost_end = MPI_Wtime(); - if (rank == 0) { - std::cout << " Finished calculating ghost elements" << std::endl; - std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." << std::endl; - } // ======================================================================== // STEP 5: Extract ghost element connectivity @@ -1420,7 +1415,7 @@ void partition_mesh( // The all_conn array was populated by MPI_Allgatherv and contains connectivity // pairs (elem_gid, node_gid) for all elements from all ranks. We now parse // this data to extract the nodes for each ghost element. 
- for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { if (r == rank) continue; // Skip our own data - we already have owned element connectivity // Parse connectivity data for rank r @@ -1545,7 +1540,7 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); // Sequential rank-wise printing of extended mesh structure info if(print_info) { - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { MPI_Barrier(MPI_COMM_WORLD); if (rank == r) { std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; @@ -1636,6 +1631,13 @@ void partition_mesh( MPI_Barrier(MPI_COMM_WORLD); + double t_ghost_end = MPI_Wtime(); + + if (rank == 0) { + std::cout << " Finished calculating ghost elements" << std::endl; + std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." << std::endl; + } + final_mesh.nodes_in_elem.update_device(); final_mesh.build_connectivity(); @@ -1686,7 +1688,7 @@ void partition_mesh( // b) Displacements and total std::vector owned_displs(world_size,0); int total_owned = 0; - for (int r=0; r owned_coords_send(3*local_owned_count, 0.0); - for (int i=0; i coord_counts(world_size); std::vector coord_displs(world_size); - for (int r=0; r coord[3] std::unordered_map> gid_to_coord; - for (int i=0; i xyz = { all_owned_coords[3*i+0], all_owned_coords[3*i+1], @@ -1749,7 +1751,7 @@ void partition_mesh( // -------------------------------------------------------------------------------------- -// Build the send patterns for elements + // Build the send patterns for elements // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. // Steps: // 1) Each rank contributes its ghost element GIDs. 
@@ -1773,7 +1775,7 @@ void partition_mesh( // Displacements and recv buffer std::vector ghost_displs(world_size, 0); int total_ghosts = 0; - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { ghost_displs[r] = total_ghosts; total_ghosts += ghost_counts[r]; } @@ -1788,7 +1790,7 @@ void partition_mesh( // Build map gid -> ranks that ghost it std::unordered_map> gid_to_ghosting_ranks; gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); - for (int r = 0; r < world_size; ++r) { + for (int r = 0; r < world_size; r++) { int cnt = ghost_counts[r]; int off = ghost_displs[r]; for (int i = 0; i < cnt; i++) { @@ -2057,7 +2059,7 @@ void partition_mesh( // // Displacements and recv buffer // std::vector ghost_node_displs(world_size, 0); // int total_ghost_nodes = 0; - // for (int r = 0; r < world_size; ++r) { + // for (int r = 0; r < world_size; r++) { // ghost_node_displs[r] = total_ghost_nodes; // total_ghost_nodes += ghost_node_counts[r]; // } @@ -2078,7 +2080,7 @@ void partition_mesh( // // Build map node_gid -> ranks that ghost it // std::unordered_map> node_gid_to_ghosting_ranks; // node_gid_to_ghosting_ranks.reserve(static_cast(total_ghost_nodes)); - // for (int r = 0; r < world_size; ++r) { + // for (int r = 0; r < world_size; r++) { // int cnt = ghost_node_counts[r]; // int off = ghost_node_displs[r]; // for (int i = 0; i < cnt; i++) { @@ -2136,9 +2138,6 @@ void partition_mesh( // MPI_Barrier(MPI_COMM_WORLD); // if(rank == 0) std::cout << " Finished building node communication reverse map" << std::endl; - - - } diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 7de5d847..88727e2e 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -58,6 +58,10 @@ int main(int argc, char** argv) { std::cout<<"Initializing mesh"< #include #include +#include +///////////////////////////////////////////////////////////////////////////// +/// +/// \fn split +/// +/// 
\brief Splits a string by a given delimiter +/// +/// \param Input string +/// \param delimiter +/// +/// \return Vector of split string values +/// +///////////////////////////////////////////////////////////////////////////// +inline std::vector split(std::string s, std::string delimiter) +{ + size_t pos_start = 0, pos_end, delim_len = delimiter.length(); + std::string token; + std::vector res; + + while ((pos_end = s.find(delimiter, pos_start)) != std::string::npos) { + token = s.substr(pos_start, pos_end - pos_start); + pos_start = pos_end + delim_len; + res.push_back(token); + } + res.push_back(s.substr(pos_start)); + return res; +} // end of split ///////////////////////////////////////////////////////////////////////////// /// @@ -777,4 +804,231 @@ void write_vtu(Mesh_t& mesh, } // end write_vtu + + ///////////////////////////////////////////////////////////////////////////// + /// + /// \fn read_vtk_mesh + /// + /// \brief Read ASCII .vtk mesh file + /// + /// \param Simulation mesh + /// \param Simulation state + /// \param Node state struct + /// \param Number of dimensions + /// + ///////////////////////////////////////////////////////////////////////////// + void read_vtk_mesh(Mesh_t& mesh, + node_t& node, + int num_dims, + std::string mesh_file_) +{ + + std::cout<<"Reading VTK mesh"< v = split (str, delimiter); + + // looking for the following text: + // POINTS %d float + if(v[0] == "POINTS"){ + size_t num_nodes = std::stoi(v[1]); + printf("Number of nodes read in %zu\n", num_nodes); + mesh.initialize_nodes(num_nodes); + + std::vector required_node_state = { node_state::coords }; + node.initialize(num_nodes, num_dims, required_node_state); + + found=true; + } // end if + + + if (i>1000){ + std::cerr << "ERROR: Failed to find POINTS in file" << std::endl; + break; + } // end if + + i++; + } // end while + + // read the node coordinates + for (node_gid=0; node_gid v = split (str, delimiter); + + // save the nodal coordinates + node.coords.host(node_gid, 
0) = std::stod(v[0]); // double + node.coords.host(node_gid, 1) = std::stod(v[1]); // double + if(num_dims==3){ + node.coords.host(node_gid, 2) = std::stod(v[2]); // double + } + + } // end for nodes + + + // Update device nodal positions + node.coords.update_device(); + + + found=false; + + // look for CELLS + i = 0; + size_t num_elem = 0; + while (found==false) { + std::string str; + std::getline(in, str); + + std::string delimiter = " "; + std::vector v = split (str, delimiter); + std::cout << v[0] << std::endl; // printing + + // looking for the following text: + // CELLS num_elem size + if(v[0] == "CELLS"){ + num_elem = std::stoi(v[1]); + printf("Number of elements read in %zu\n", num_elem); + + // initialize elem variables + mesh.initialize_elems(num_elem, num_dims); + + found=true; + } // end if + + + if (i>1000){ + printf("ERROR: Failed to find CELLS \n"); + break; + } // end if + + i++; + } // end while + + + // read the node ids in the element + for (elem_gid=0; elem_gid v = split (str, delimiter); + num_nodes_in_elem = std::stoi(v[0]); + + for (size_t node_lid=0; node_lid v = split (str, delimiter); + + // looking for the following text: + // CELLS num_elem size + if(v[0] == "CELL_TYPES"){ + + std::getline(in, str); + elem_type = std::stoi(str); + + found=true; + } // end if + + + if (i>1000){ + printf("ERROR: Failed to find elem_TYPE \n"); + break; + } // end if + + i++; + } // end while + printf("Element type = %zu \n", elem_type); + // elem types: + // linear hex = 12, linear quad = 9 + found=false; + + + if(num_nodes_in_elem==8 & elem_type != 12) { + printf("Wrong element type of %zu \n", elem_type); + std::cerr << "ERROR: incorrect element type in VTK file" << std::endl; + } + + in.close(); + +} // end of VTKread function + #endif \ No newline at end of file From 4671064a27a0f01c4f4849be2df78667fa339c7d Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 10 Nov 2025 16:19:45 -0600 Subject: [PATCH 28/52] BUILD: Getting everything building with new 
examples --- examples/CMakeLists.txt | 157 +++++----- examples/mesh_decomp/CMakeLists.txt | 1 + examples/mesh_decomp/communication_plan.h | 292 +++++++++++------- examples/mesh_decomp/decomp_utils.h | 61 +++- examples/mesh_decomp/mpi_type.h | 50 +-- ...cation_plan.h => communication_plan_old.h} | 0 src/include/mapped_mpi_types.h | 2 +- 7 files changed, 324 insertions(+), 239 deletions(-) rename src/include/{communication_plan.h => communication_plan_old.h} (100%) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e32ddb2d..4c379334 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -88,117 +88,118 @@ if (KOKKOS) add_definitions(-DHAVE_THREADS=1) endif() - # add_executable(testsetval test_set_values.cpp) - # target_link_libraries(testsetval ${LINKING_LIBRARIES}) + add_executable(testsetval test_set_values.cpp) + target_link_libraries(testsetval ${LINKING_LIBRARIES}) - # add_executable(mtestkokkos main_kokkos.cpp) - # target_link_libraries(mtestkokkos ${LINKING_LIBRARIES}) + add_executable(mtestkokkos main_kokkos.cpp) + target_link_libraries(mtestkokkos ${LINKING_LIBRARIES}) - # add_executable(drrak_test test_drrak.cpp) - # target_link_libraries(drrak_test ${LINKING_LIBRARIES}) + add_executable(drrak_test test_drrak.cpp) + target_link_libraries(drrak_test ${LINKING_LIBRARIES}) - # add_executable(test_kokkos_for kokkos_for.cpp) - # target_link_libraries(test_kokkos_for ${LINKING_LIBRARIES}) + add_executable(test_kokkos_for kokkos_for.cpp) + target_link_libraries(test_kokkos_for ${LINKING_LIBRARIES}) - # add_executable(test_dual_types test_dual_types.cpp) - # target_link_libraries(test_dual_types ${LINKING_LIBRARIES}) + add_executable(test_dual_types test_dual_types.cpp) + target_link_libraries(test_dual_types ${LINKING_LIBRARIES}) - # add_executable(kokkos_csr CSRKokkos.cpp) - # target_link_libraries(kokkos_csr ${LINKING_LIBRARIES}) + add_executable(kokkos_csr CSRKokkos.cpp) + target_link_libraries(kokkos_csr ${LINKING_LIBRARIES}) 
- # add_executable(kokkos_csc CSCKokkos.cpp) - # target_link_libraries(kokkos_csc ${LINKING_LIBRARIES}) + add_executable(kokkos_csc CSCKokkos.cpp) + target_link_libraries(kokkos_csc ${LINKING_LIBRARIES}) - # add_executable(mtr_kokkos-simple mtr-kokkos-simple.cpp) - # target_link_libraries(mtr_kokkos-simple ${LINKING_LIBRARIES}) + add_executable(mtr_kokkos-simple mtr-kokkos-simple.cpp) + target_link_libraries(mtr_kokkos-simple ${LINKING_LIBRARIES}) - # add_executable(annkokkos ann_kokkos.cpp) - # target_link_libraries(annkokkos ${LINKING_LIBRARIES}) + add_executable(annkokkos ann_kokkos.cpp) + target_link_libraries(annkokkos ${LINKING_LIBRARIES}) - # add_executable(annkokkos_compare ann_kokkos_compare.cpp) - # target_link_libraries(annkokkos_compare ${LINKING_LIBRARIES}) + add_executable(annkokkos_compare ann_kokkos_compare.cpp) + target_link_libraries(annkokkos_compare ${LINKING_LIBRARIES}) - # #add_executable(ompperftest ompperftest.cpp) - # #target_link_libraries(ompperftest ${LINKING_LIBRARIES}) + #add_executable(ompperftest ompperftest.cpp) + #target_link_libraries(ompperftest ${LINKING_LIBRARIES}) - # add_executable(lu_test test_lu_solve.cpp) - # target_link_libraries(lu_test ${LINKING_LIBRARIES}) + add_executable(lu_test test_lu_solve.cpp) + target_link_libraries(lu_test ${LINKING_LIBRARIES}) - # add_executable(qr_test test_qr_solve.cpp) - # target_link_libraries(qr_test ${LINKING_LIBRARIES}) + add_executable(qr_test test_qr_solve.cpp) + target_link_libraries(qr_test ${LINKING_LIBRARIES}) - # if (Matar_ENABLE_TRILINOS) - # add_executable(anndistributed ann_distributed.cpp) - # target_link_libraries(anndistributed ${LINKING_LIBRARIES}) + if (Matar_ENABLE_TRILINOS) + add_executable(anndistributed ann_distributed.cpp) + target_link_libraries(anndistributed ${LINKING_LIBRARIES}) - # add_executable(anndistributed_crs ann_distributed_crs.cpp) - # target_link_libraries(anndistributed_crs ${LINKING_LIBRARIES}) + add_executable(anndistributed_crs 
ann_distributed_crs.cpp) + target_link_libraries(anndistributed_crs ${LINKING_LIBRARIES}) + + add_executable(test_tpetra_farray test_tpetra_farray.cpp) + target_link_libraries(test_tpetra_farray ${LINKING_LIBRARIES}) - # add_executable(test_tpetra_farray test_tpetra_farray.cpp) - # target_link_libraries(test_tpetra_farray ${LINKING_LIBRARIES}) + add_executable(test_tpetra_carray test_tpetra_carray.cpp) + target_link_libraries(test_tpetra_carray ${LINKING_LIBRARIES}) - # add_executable(test_tpetra_carray test_tpetra_carray.cpp) - # target_link_libraries(test_tpetra_carray ${LINKING_LIBRARIES}) + add_executable(test_tpetra_crs test_tpetra_crs.cpp) + target_link_libraries(test_tpetra_crs ${LINKING_LIBRARIES}) - # add_executable(test_tpetra_crs test_tpetra_crs.cpp) - # target_link_libraries(test_tpetra_crs ${LINKING_LIBRARIES}) + add_executable(test_tpetra_mesh test_tpetra_mesh.cpp) + target_link_libraries(test_tpetra_mesh ${LINKING_LIBRARIES}) + endif() - # add_executable(test_tpetra_mesh test_tpetra_mesh.cpp) - # target_link_libraries(test_tpetra_mesh ${LINKING_LIBRARIES}) - # endif() + if (OPENMP) + add_executable(parallel_hello_world parallel_hello_world.cpp) + target_link_libraries(parallel_hello_world ${LINKING_LIBRARIES}) + endif() - # if (OPENMP) - # add_executable(parallel_hello_world parallel_hello_world.cpp) - # target_link_libraries(parallel_hello_world ${LINKING_LIBRARIES}) - # endif() + if (MPI) + include_directories(laplaceMPI) + add_subdirectory(laplaceMPI) - # if (MPI) - # include_directories(laplaceMPI) - # add_subdirectory(laplaceMPI) - # endif() + include_directories(mesh_decomp) + add_subdirectory(mesh_decomp) + endif() endif() -# ### HIP Linking error, will add back in after fixed -# if (NOT HIP) -# include_directories(virtualFcnKokkos) -# add_subdirectory(virtualFcnKokkos) -# endif() +### HIP Linking error, will add back in after fixed +if (NOT HIP) + include_directories(virtualFcnKokkos) + add_subdirectory(virtualFcnKokkos) +endif() -# # In 
testing, not working -# #include_directories(gArrayofgArrays) -# #add_subdirectory(gArrayofgArrays) +# In testing, not working +#include_directories(gArrayofgArrays) +#add_subdirectory(gArrayofgArrays) -# include_directories(virtualFcnMATAR) -# add_subdirectory(virtualFcnMATAR) +include_directories(virtualFcnMATAR) +add_subdirectory(virtualFcnMATAR) -# include_directories(laplace) -# add_subdirectory(laplace) +include_directories(laplace) +add_subdirectory(laplace) -# include_directories(halfspace_cooling) -# add_subdirectory(halfspace_cooling) +include_directories(halfspace_cooling) +add_subdirectory(halfspace_cooling) -# include_directories(watt-graph) -# add_subdirectory(watt-graph) +include_directories(watt-graph) +add_subdirectory(watt-graph) -# #include_directories(matar_fortran) -# #add_subdirectory(matar_fortran) +#include_directories(matar_fortran) +#add_subdirectory(matar_fortran) -# include_directories(sparsetests) -# add_subdirectory(sparsetests) +include_directories(sparsetests) +add_subdirectory(sparsetests) -# include_directories(test_rocm) -# add_subdirectory(test_rocm) +include_directories(test_rocm) +add_subdirectory(test_rocm) -include_directories(mesh_decomp) -add_subdirectory(mesh_decomp) -#include_directories(phaseField/srcKokkosVerbose) -#add_subdirectory(phaseField/srcKokkosVerbose) +# include_directories(phaseField/srcKokkosVerbose) +# add_subdirectory(phaseField/srcKokkosVerbose) -#include_directories(phaseField/srcMacros) -#add_subdirectory(phaseField/srcMacros) +# include_directories(phaseField/srcMacros) +# add_subdirectory(phaseField/srcMacros) -#include_directories(phaseFieldMPI) -#add_subdirectory(phaseFieldMPI) +# include_directories(phaseFieldMPI) +# add_subdirectory(phaseFieldMPI) diff --git a/examples/mesh_decomp/CMakeLists.txt b/examples/mesh_decomp/CMakeLists.txt index 7b7306cd..b5ea83ca 100644 --- a/examples/mesh_decomp/CMakeLists.txt +++ b/examples/mesh_decomp/CMakeLists.txt @@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 
3.1.3) # Find MPI find_package(MPI REQUIRED) +add_definitions(-DHAVE_MPI=1) find_package(Matar REQUIRED) diff --git a/examples/mesh_decomp/communication_plan.h b/examples/mesh_decomp/communication_plan.h index eabba8da..2023f609 100644 --- a/examples/mesh_decomp/communication_plan.h +++ b/examples/mesh_decomp/communication_plan.h @@ -1,6 +1,8 @@ #ifndef COMMUNICATION_PLAN_H #define COMMUNICATION_PLAN_H +#ifdef HAVE_MPI +#include #include "matar.h" using namespace mtr; @@ -58,7 +60,6 @@ using namespace mtr; // This is a good optimization for large meshes, but will require maps from MPI_comm_world rank IDs to the new reordered rank IDs. int reorder = 0; - DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_per_rank] Indices of items to send to each rank DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_per_rank] Indices of items to receive from each rank @@ -69,12 +70,9 @@ using namespace mtr; DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank - int total_send_count; - int total_recv_count; + int total_send_count; // Total number of items to send + int total_recv_count; // Total number of items to receive - - - // ======================================================================== // CONSTRUCTOR / INITIALIZATION // ======================================================================== @@ -99,148 +97,218 @@ using namespace mtr; MPI_Comm_size(comm_world, &world_size); } + /** + * @brief Initialize an MPI distributed graph communicator for sparse neighbor communication. + * + * This function creates an MPI "dist graph communicator" tailored to the sparse data exchange + * patterns typical in mesh-based parallel applications. It establishes direct knowledge for MPI + * about which processes (ranks) each process will communicate with. 
This improves the efficiency + * and clarity of later communication (for example, with MPI_Neighbor_alltoallv). + * + * This function is especially useful when the communication pattern is not all-to-all, but rather + * a sparse subset: for instance, where each process only exchanges data with a few neighbors. + * + * ==== Key Concepts ==== + * - MPI Communicator: An MPI object representing a group of processes that can communicate with each other. + * For context, "MPI_COMM_WORLD" is a communicator including all processes, but a graph communicator + * customizes direct process connections. + * - Rank: Integer ID identifying a process in a communicator. + * - Distributed Graph: MPI can represent communication as a directed sparse graph, with edges from + * this rank to those it needs to send to, and from those it will receive from. + * + * ==== Parameters ==== + * @param num_send_ranks [in] Number of ranks this process will send data to (out-neighbors). + * @param send_rank_ids [in] Array of size num_send_ranks; each entry is the rank of a process to send to. + * @param num_recv_ranks [in] Number of ranks this process will receive data from (in-neighbors). + * @param recv_rank_ids [in] Array of size num_recv_ranks; each entry is the rank of a process to receive from. + * + * ==== Steps ==== + * + * 1. Checks if the basic communicator has been initialized. + * Throws an error if it has not. + * + * 2. Stores the send/receive neighbor counts and rank lists internally. + * Copies the IDs into the internal device-host arrays. + * - send_rank_ids: process IDs that will be destinations for outgoing messages. + * - recv_rank_ids: process IDs that will provide incoming messages. + * + * 3. Calls MPI_Dist_graph_create_adjacent: + * This constructs a new MPI communicator ("mpi_comm_graph") that encodes this process's + * inbound and outbound neighbors. MPI uses this to optimize and route messages directly + * and efficiently during later neighbor collectives. 
+ * + * - Note: The 'recv_weights' and 'send_weights' arguments are set to NULL here; + * this means we are not giving extra weighting or priorities to any connection. + * - The 'reorder' argument (set to 0 in this class) disables rank reordering; + * this ensures the assignment of process ranks is preserved, which is often needed + * for mapping data or results back to physical entities. + * - On return, 'mpi_comm_graph' will allow use of "neighbor" collectives (MPI_Neighbor_alltoall[v], etc.), + * which automatically use the provided topology to send/receive to only neighbors efficiently. + * + * 4. Marks the internal flag indicating that the graph communicator has been set up ("has_comm_graph"). + * + * ==== Example Usage ==== + * Suppose rank 0 will send to ranks 1 and 2, and receive from rank 3 only: + * int send_ranks[2] = {1, 2}; + * int recv_ranks[1] = {3}; + * initialize_graph_communicator(2, send_ranks, 1, recv_ranks); + * + * ==== Why Use This? ==== + * - This avoids the need to do manual pairwise MPI_Send/MPI_Recv in your code, + * and enables the use of neighbor collectives -- concise, scalable, and hard-to-get-wrong. + * - It explicitly tells MPI only about your neighbors, so it can optimize routes and memory. + * - If you have a large number of processes or a mesh/network with only local coupling, + * this approach scales much better than using global/all-to-all communication. + * + * @throws std::runtime_error if the base communicator has not been initialized. + */ void initialize_graph_communicator(int num_send_ranks, int* send_rank_ids, int num_recv_ranks, int* recv_rank_ids){ + // Check if the MPI_COMM_WORLD communicator has been initialized. 
if(!has_comm_world){ throw std::runtime_error("MPI communicator for the world has not been initialized"); } + // Store the number of outbound and inbound neighbors this->num_send_ranks = num_send_ranks; this->num_recv_ranks = num_recv_ranks; + // Copy and store send neighbor IDs (out-bound neighbors: where we will send data to) this->send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); for(int i = 0; i < num_send_ranks; i++){ this->send_rank_ids(i) = send_rank_ids[i]; } - + // Copy and store receive neighbor IDs (in-bound neighbors: where we will receive data from) this->recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); for(int i = 0; i < num_recv_ranks; i++){ this->recv_rank_ids(i) = recv_rank_ids[i]; } + // Create the distributed graph communicator. + // This call links this process to its explicit send and receive neighbors. + // See https://www.open-mpi.org/doc/v4.0/man3/MPI_Dist_graph_create_adjacent.3.php for more details. MPI_Dist_graph_create_adjacent( - mpi_comm_world, - num_recv_ranks, - this->recv_rank_ids.host_pointer(), - recv_weights, - num_send_ranks, - this->send_rank_ids.host_pointer(), - send_weights, - info, - reorder, - &mpi_comm_graph + mpi_comm_world, // Existing communicator (usually MPI_COMM_WORLD) + num_recv_ranks, // Number of in-neighbors (recv) + this->recv_rank_ids.host_pointer(), // Array of in-neighbor ranks (who we receive from) + recv_weights, // Edge weights (NULL = unweighted) + num_send_ranks, // Number of out-neighbors (send) + this->send_rank_ids.host_pointer(), // Array of out-neighbor ranks (who we send to) + send_weights, // Edge weights (NULL = unweighted) + info, // Additional info for MPI (not used, set to MPI_INFO_NULL) + reorder, // Allow MPI to reorder ranks for performance (0 disables) + &mpi_comm_graph // [out] New graph communicator ); + // Set the internal flag indicating that we have created the MPI distributed graph communicator. 
has_comm_graph = true; } - void verify_graph_communicator(){ - if(!has_comm_graph){ - throw std::runtime_error("MPI graph communicator has not been initialized"); - } - - // ============================================================================ - // Verify the distributed graph communicator - // ============================================================================ - // Query the graph to verify it matches what we specified - int indegree_out, outdegree_out, weighted; - MPI_Dist_graph_neighbors_count(mpi_comm_graph, &indegree_out, &outdegree_out, &weighted); + // void verify_graph_communicator(){ + // if(!has_comm_graph){ + // throw std::runtime_error("MPI graph communicator has not been initialized"); + // } + + // // ============================================================================ + // // Verify the distributed graph communicator + // // ============================================================================ + // // Query the graph to verify it matches what we specified + // int indegree_out, outdegree_out, weighted; + // MPI_Dist_graph_neighbors_count(mpi_comm_graph, &indegree_out, &outdegree_out, &weighted); - // Allocate arrays to receive neighbor information - std::vector sources_out(indegree_out); - std::vector sourceweights_out(indegree_out); - std::vector destinations_out(outdegree_out); - std::vector destweights_out(outdegree_out); + // // Allocate arrays to receive neighbor information + // std::vector sources_out(indegree_out); + // std::vector sourceweights_out(indegree_out); + // std::vector destinations_out(outdegree_out); + // std::vector destweights_out(outdegree_out); - // Retrieve the actual neighbors from the graph communicator - MPI_Dist_graph_neighbors(mpi_comm_graph, - indegree_out, sources_out.data(), sourceweights_out.data(), - outdegree_out, destinations_out.data(), destweights_out.data()); + // // Retrieve the actual neighbors from the graph communicator + // MPI_Dist_graph_neighbors(mpi_comm_graph, + // 
indegree_out, sources_out.data(), sourceweights_out.data(), + // outdegree_out, destinations_out.data(), destweights_out.data()); - int rank = -1; - MPI_Comm_rank(mpi_comm_world, &rank); + // int rank = -1; + // MPI_Comm_rank(mpi_comm_world, &rank); - // Additional verification: Check if the queried values match our input - bool verification_passed = true; + // // Additional verification: Check if the queried values match our input + // bool verification_passed = true; - // Print verification information for each rank sequentially - for (int r = 0; r < world_size; ++r) { - MPI_Barrier(mpi_comm_world); - if (rank == r) { - std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << std::endl; - std::cout << " Indegree (receives from " << indegree_out << " ranks): "; - for (int i = 0; i < indegree_out; ++i) { - std::cout << sources_out[i] << " "; - } - std::cout << std::endl; + // // Print verification information for each rank sequentially + // for (int r = 0; r < world_size; ++r) { + // MPI_Barrier(mpi_comm_world); + // if (rank == r) { + // std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << std::endl; + // std::cout << " Indegree (receives from " << indegree_out << " ranks): "; + // for (int i = 0; i < indegree_out; ++i) { + // std::cout << sources_out[i] << " "; + // } + // std::cout << std::endl; - std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; - for (int i = 0; i < outdegree_out; ++i) { - std::cout << destinations_out[i] << " "; - } - std::cout << std::endl; + // std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; + // for (int i = 0; i < outdegree_out; ++i) { + // std::cout << destinations_out[i] << " "; + // } + // std::cout << std::endl; - std::cout << " Weighted: " << (weighted ? "yes" : "no") << std::endl; - } - MPI_Barrier(mpi_comm_world); - } + // std::cout << " Weighted: " << (weighted ? 
"yes" : "no") << std::endl; + // } + // MPI_Barrier(mpi_comm_world); + // } - // Check if the counts match our stored values - if (indegree_out != num_recv_ranks) { - std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " - << "Expected " << num_recv_ranks << ", got " << indegree_out << std::endl; - verification_passed = false; - } - if (outdegree_out != num_send_ranks) { - std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! " - << "Expected " << num_send_ranks << ", got " << outdegree_out << std::endl; - verification_passed = false; - } + // // Check if the counts match our stored values + // if (indegree_out != num_recv_ranks) { + // std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " + // << "Expected " << num_recv_ranks << ", got " << indegree_out << std::endl; + // verification_passed = false; + // } + // if (outdegree_out != num_send_ranks) { + // std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! " + // << "Expected " << num_send_ranks << ", got " << outdegree_out << std::endl; + // verification_passed = false; + // } - // Check if source ranks match (build set from our stored recv_rank_ids) - std::set sources_set_in; - for (int i = 0; i < num_recv_ranks; ++i) { - sources_set_in.insert(recv_rank_ids.host(i)); - } - std::set sources_set_out(sources_out.begin(), sources_out.end()); - if (sources_set_in != sources_set_out) { - std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" << std::endl; - verification_passed = false; - } + // // Check if source ranks match (build set from our stored recv_rank_ids) + // std::set sources_set_in; + // for (int i = 0; i < num_recv_ranks; ++i) { + // sources_set_in.insert(recv_rank_ids.host(i)); + // } + // std::set sources_set_out(sources_out.begin(), sources_out.end()); + // if (sources_set_in != sources_set_out) { + // std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" 
<< std::endl; + // verification_passed = false; + // } - // Check if destination ranks match (build set from our stored send_rank_ids) - std::set dests_set_in; - for (int i = 0; i < num_send_ranks; ++i) { - dests_set_in.insert(send_rank_ids.host(i)); - } - std::set dests_set_out(destinations_out.begin(), destinations_out.end()); - if (dests_set_in != dests_set_out) { - std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" << std::endl; - verification_passed = false; - } + // // Check if destination ranks match (build set from our stored send_rank_ids) + // std::set dests_set_in; + // for (int i = 0; i < num_send_ranks; ++i) { + // dests_set_in.insert(send_rank_ids.host(i)); + // } + // std::set dests_set_out(destinations_out.begin(), destinations_out.end()); + // if (dests_set_in != dests_set_out) { + // std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" << std::endl; + // verification_passed = false; + // } - // Global verification check - int local_passed = verification_passed ? 1 : 0; - int global_passed = 0; - MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, mpi_comm_world); - MPI_Barrier(mpi_comm_world); - if (rank == 0) { - if (global_passed) { - std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; - } else { - std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; - } - } - MPI_Barrier(mpi_comm_world); - } - + // // Global verification check + // int local_passed = verification_passed ? 
1 : 0; + // int global_passed = 0; + // MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, mpi_comm_world); + // MPI_Barrier(mpi_comm_world); + // if (rank == 0) { + // if (global_passed) { + // std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; + // } else { + // std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; + // } + // } + // MPI_Barrier(mpi_comm_world); + // } void setup_send_recv(DRaggedRightArrayKokkos &rank_send_ids, DRaggedRightArrayKokkos &rank_recv_ids){ - this->send_indices_ = rank_send_ids; // ods of element data to send to each rank - this->recv_indices_ = rank_recv_ids; // - + this->send_indices_ = rank_send_ids; // indices of element data to send to each rank + this->recv_indices_ = rank_recv_ids; // indices of element data to receive from each rank // Setup send data this->send_counts_ = DCArrayKokkos(num_send_ranks, "send_counts"); @@ -280,9 +348,9 @@ using namespace mtr; MPI_Barrier(mpi_comm_world); } +}; // End of CommunicationPlan -}; - -#endif // COMMUNICATION_PLAN_H +#endif // end if HAVE_MPI +#endif // end if COMMUNICATION_PLAN_H diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index dada0d99..f0e7ae4d 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -21,6 +21,29 @@ #include "scotch.h" #include "ptscotch.h" +/** + * @brief Partitions the input mesh into a naive element-based decomposition across MPI ranks. + * + * This function splits the input mesh (and its associated node information) evenly among the given number of MPI ranks. + * It assigns contiguous blocks of elements (and the corresponding nodes and nodal data) to each rank. + * + * The function constructs: + * - The sub-mesh (naive_mesh) and its nodes (naive_node) for the local rank. + * - Maps and vectors indicating elements and nodes present on each rank. 
+ * - Auxiliary arrays (elems_in_elem_on_rank, num_elems_in_elem_per_rank) for local element connectivity and neighbor look-ups. + * + * The decomposition is "naive" in that it uses a simple contiguous block assignment, without regard to mesh topology or quality of partitioning. + * This function is generally used as the preliminary step before repartitioning with tools like PT-Scotch or for algorithm prototyping. + * + * @param initial_mesh[in] The input mesh containing all elements/nodes on rank 0. + * @param initial_node[in] The nodal data for the input mesh on rank 0. + * @param naive_mesh[out] The mesh on this rank after naive partitioning. + * @param naive_node[out] The nodal data on this rank after naive partitioning. + * @param elems_in_elem_on_rank[out] Vector of element-to-element connectivity for this rank's local mesh. + * @param num_elems_in_elem_per_rank[out] Vector of counts for element neighbors for each local element. + * @param world_size[in] Number of MPI ranks (world size). + * @param rank[in] This MPI rank's id. + */ void naive_partition_mesh( Mesh_t& initial_mesh, @@ -530,6 +553,32 @@ void naive_partition_mesh( } +/** + * @brief Partitions the input mesh using PT-Scotch and constructs the final distributed mesh. + * + * This function performs parallel mesh partitioning using a two-stage approach: + * 1. A naive partition is first constructed (simple assignment of mesh elements/nodes across ranks). + * 2. PT-Scotch is then used to repartition the mesh for load balancing and improved connectivity. + * + * The partitioned mesh, nodal data, and associated connectivity/gauss point information + * are distributed among MPI ranks as a result. The procedure ensures that each rank receives + * its assigned portion of the mesh and associated data in the final (target) decomposition. + * + * @param initial_mesh[in] The input (global) mesh, present on rank 0 or all ranks at start. 
+ * @param final_mesh[out] The mesh assigned to this rank after PT-Scotch decomposition. + * @param initial_node[in] Nodal data for the input (global) mesh; must match initial_mesh. + * @param final_node[out] Nodal data for this rank after decomposition (corresponds to final_mesh). + * @param gauss_point[out] Gauss point data structure, filled out for this rank's mesh. + * @param world_size[in] Number of MPI ranks in use (the total number of partitions). + * @param rank[in] This process's MPI rank ID. + * + * Internals: + * - The routine uses a naive_partition_mesh() helper to create an initial contiguous mesh partition. + * - It then uses PT-Scotch distributed graph routines to compute an improved partition and create the final mesh layout. + * - Both element-to-element and node-to-element connectivity, as well as mapping and ghosting information, + * are managed and exchanged across ranks. + * - MPI routines synchronize and exchange the relevant mesh and nodal data following the computed partition. 
+ */ void partition_mesh( Mesh_t& initial_mesh, @@ -557,13 +606,9 @@ void partition_mesh( std::vector elems_in_elem_on_rank; std::vector num_elems_in_elem_per_rank; - naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); - - -// ****************************************************************************************** -// Compute a repartition of the mesh using pt-scotch -// ****************************************************************************************** + // Perform the naive partitioning of the mesh + naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); /********************************************************************************** @@ -1906,7 +1951,7 @@ void partition_mesh( element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), indegree, ghost_elem_receive_ranks_vec.data()); MPI_Barrier(MPI_COMM_WORLD); // Optional: Verify the graph communicator was created successfully - if(print_info) element_communication_plan.verify_graph_communicator(); + // if(print_info) element_communication_plan.verify_graph_communicator(); // ****************************************************************************************** @@ -2036,7 +2081,7 @@ void partition_mesh( // -------------------------------------------------------------------------------------- -// Build the send pattern for nodes + // TODO: Build the send pattern for nodes -------------------------------------------------------------------------------------- // Build reverse map via global IDs: for each local node gid, find ranks that ghost it. // Steps: // 1) Each rank contributes its ghost node GIDs. 
diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h index 98f62313..be7984d5 100644 --- a/examples/mesh_decomp/mpi_type.h +++ b/examples/mesh_decomp/mpi_type.h @@ -1,12 +1,13 @@ #ifndef MPICARRAYKOKKOS_H #define MPICARRAYKOKKOS_H +// #ifdef HAVE_MPI +#include #include "matar.h" #include "communication_plan.h" -using namespace mtr; - -// Add this before the MPICArrayKokkos class definition +namespace mtr +{ // Type trait to map C++ types to MPI_Datatype template @@ -70,8 +71,7 @@ struct mpi_type_map { ///////////////////////// -// MPICArrayKokkos: Dual type for managing distributed data on both CPU and GPU. -// +// MPICArrayKokkos: Type for managing distributed data on both CPU and GPU. ///////////////////////// template class MPICArrayKokkos { @@ -322,40 +322,6 @@ class MPICArrayKokkos { this_array_.set_values(value); }; - - void reduce_sum(T& result){}; - - - // // MPI send wrapper - // void send(size_t count, int dest, int tag, MPI_Comm comm); - - // // MPI recieve wrapper - // void recv(size_t count, int dest, int tag, MPI_Comm comm); - - // // MPI broadcast wrapper - // void broadcast(size_t count, int root, MPI_Comm comm); - - // // MPI scatter wrapper - // void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - - // // MPI gather wrapper - // void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - - // // MPI allgather wrapper - // void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); - - // // MPI send wrapper - // void isend(size_t count, int dest, int tag, MPI_Comm comm); - - // // MPI recieve wrapper - // void irecv(size_t count, int dest, int tag, MPI_Comm comm); - - // // MPI wait wrapper for sender - // void wait_send(); - - // // MPI wait wrapper for receiver - // void wait_recv(); - // Deconstructor virtual KOKKOS_INLINE_FUNCTION ~MPICArrayKokkos (); @@ -606,4 +572,8 @@ 
MPICArrayKokkos::~MPICArrayKokkos() { } -#endif \ No newline at end of file +} // end namespace mtr + + +// #endif // end if have MPI +#endif // end if MPICARRAYKOKKOS_H \ No newline at end of file diff --git a/src/include/communication_plan.h b/src/include/communication_plan_old.h similarity index 100% rename from src/include/communication_plan.h rename to src/include/communication_plan_old.h diff --git a/src/include/mapped_mpi_types.h b/src/include/mapped_mpi_types.h index 6d5d18d3..3c0ca4d0 100644 --- a/src/include/mapped_mpi_types.h +++ b/src/include/mapped_mpi_types.h @@ -45,7 +45,7 @@ #include #include #include "partition_map.h" -#include "communication_plan.h" +// #include "communication_plan.h" namespace mtr { From 70a30cfe184340106c15af1b2f3b6ab965a5c8b9 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 10 Nov 2025 16:31:18 -0600 Subject: [PATCH 29/52] ENH: Working through build with examples --- examples/mesh_decomp/state.h | 2 +- .../include}/communication_plan.h | 0 src/include/mapped_mpi_types.h | 1 - src/include/mpi_types.h | 897 +++++++----------- src/include/mpi_types_old.h | 784 +++++++++++++++ 5 files changed, 1131 insertions(+), 553 deletions(-) rename {examples/mesh_decomp => src/include}/communication_plan.h (100%) create mode 100644 src/include/mpi_types_old.h diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h index 2ed970d5..0da00095 100644 --- a/examples/mesh_decomp/state.h +++ b/examples/mesh_decomp/state.h @@ -35,7 +35,7 @@ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define STATE_H #include "matar.h" -#include "mpi_type.h" +// #include "mpi_type.h" using namespace mtr; diff --git a/examples/mesh_decomp/communication_plan.h b/src/include/communication_plan.h similarity index 100% rename from examples/mesh_decomp/communication_plan.h rename to src/include/communication_plan.h diff --git a/src/include/mapped_mpi_types.h b/src/include/mapped_mpi_types.h index 3c0ca4d0..ed690ca6 100644 --- a/src/include/mapped_mpi_types.h +++ b/src/include/mapped_mpi_types.h @@ -45,7 +45,6 @@ #include #include #include "partition_map.h" -// #include "communication_plan.h" namespace mtr { diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index b10a57fc..ac651551 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -1,120 +1,149 @@ -#ifndef MPI_TYPES_H -#define MPI_TYPES_H -/********************************************************************************************** - © 2020. Triad National Security, LLC. All rights reserved. - This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos - National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. - Department of Energy/National Nuclear Security Administration. All rights in the program are - reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear - Security Administration. The Government is granted for itself and others acting on its behalf a - nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare - derivative works, distribute copies to the public, perform publicly and display publicly, and - to permit others to do so. - This program is open source under the BSD-3 License. - Redistribution and use in source and binary forms, with or without modification, are permitted - provided that the following conditions are met: - - 1. 
Redistributions of source code must retain the above copyright notice, this list of - conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, this list of - conditions and the following disclaimer in the documentation and/or other materials - provided with the distribution. - - 3. Neither the name of the copyright holder nor the names of its contributors may be used - to endorse or promote products derived from this software without specific prior - written permission. - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS - IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; - OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, - WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR - OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF - ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- **********************************************************************************************/ - -#include "host_types.h" -#include "kokkos_types.h" -#include +#ifndef MPICARRAYKOKKOS_H +#define MPICARRAYKOKKOS_H + #ifdef HAVE_MPI #include +#include "matar.h" +#include "communication_plan.h" namespace mtr { +// Type trait to map C++ types to MPI_Datatype +template +struct mpi_type_map { + static MPI_Datatype value() { + static_assert(sizeof(T) == 0, "Unsupported type for MPI communication"); + return MPI_DATATYPE_NULL; + } +}; + +// Specializations for common types +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_INT; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_LONG_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED_LONG; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_FLOAT; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_DOUBLE; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_CHAR; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_UNSIGNED_CHAR; } +}; + +template <> +struct mpi_type_map { + static MPI_Datatype value() { return MPI_C_BOOL; } +}; + + ///////////////////////// -// MPIArrayKokkos: Dual type for managing distributed data on both CPU and GPU. +// MPICArrayKokkos: Type for managing distributed data on both CPU and GPU. 
///////////////////////// template -class MPIArrayKokkos { +class MPICArrayKokkos { - // this is manage - using TArray1D = Kokkos::DualView ; + // Dual view for managing data on both CPU and GPU + DCArrayKokkos this_array_; + + DCArrayKokkos send_buffer_; + DCArrayKokkos recv_buffer_; protected: size_t dims_[7]; size_t length_; size_t order_; // tensor order (rank) - int mpi_recv_rank_; - int mpi_tag_; + MPI_Comm mpi_comm_; MPI_Status mpi_status_; MPI_Datatype mpi_datatype_; MPI_Request mpi_request_; - TArray1D this_array_; + - void set_mpi_type(); + // --- Ghost Communication Support --- + CommunicationPlan* comm_plan_; // Pointer to shared communication plan + + + DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank + DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank + DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank + DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank + + size_t stride_; // [size: num_dims] Number of contiguous values per first index element + + + DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_by_rank] Indices of items to send to each rank + DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_by_rank] Indices of items to receive from each rank + + + size_t num_owned_; // Number of owned items (nodes/elements) + size_t num_ghost_; // Number of ghost items (nodes/elements) public: // Data member to access host view ViewCArray host; - MPIArrayKokkos(); + + // Note, consider this for sending blocks without dealing with stride_ + // MPI_Datatype vector_type; + // MPI_Type_contiguous(stride_, mpi_type_map::value(), &vector_type); + // MPI_Type_commit(&vector_type); + + MPICArrayKokkos(); - MPIArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPICArrayKokkos(size_t dim0, 
const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); - MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); - // These functions can setup the data needed for halo send/receives - // Not necessary for standard MPI comms - void mpi_setup(); - - void mpi_setup(int recv_rank); - - void mpi_setup(int recv_rank, int tag); - - void mpi_setup(int recv_rank, int tag, MPI_Comm comm); - - void mpi_set_rank(int recv_rank); - - void mpi_set_tag(int tag); - - void mpi_set_comm(MPI_Comm comm); - - int get_rank(); - - int get_tag(); - MPI_Comm get_comm(); KOKKOS_INLINE_FUNCTION T& operator()(size_t i) const; @@ -140,7 +169,48 @@ class MPIArrayKokkos { size_t n, size_t o) const; KOKKOS_INLINE_FUNCTION - MPIArrayKokkos& operator=(const MPIArrayKokkos& temp); + MPICArrayKokkos& operator=(const MPICArrayKokkos& temp); + + + // Method to set comm plan for 
halo communication + void initialize_comm_plan(CommunicationPlan& comm_plan){ + comm_plan_ = &comm_plan; + + size_t send_size = comm_plan_->total_send_count * stride_; + size_t recv_size = comm_plan_->total_recv_count * stride_; + + if (send_size > 0) { + send_buffer_ = DCArrayKokkos(send_size, "send_buffer"); + } + if (recv_size > 0) { + recv_buffer_ = DCArrayKokkos(recv_size, "recv_buffer"); + } + + if (comm_plan_->num_send_ranks > 0) { + send_counts_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_counts"); + send_displs_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_displs"); + + for(int i = 0; i < comm_plan_->num_send_ranks; i++){ + send_counts_.host(i) = comm_plan_->send_counts_.host(i) * stride_; + send_displs_.host(i) = comm_plan_->send_displs_.host(i) * stride_; + } + send_counts_.update_device(); + send_displs_.update_device(); + } + + if (comm_plan_->num_recv_ranks > 0) { + recv_counts_ = DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_counts"); + recv_displs_ = DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_displs"); + + for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ + recv_counts_.host(i) = comm_plan_->recv_counts_.host(i) * stride_; + recv_displs_.host(i) = comm_plan_->recv_displs_.host(i) * stride_; + } + recv_counts_.update_device(); + recv_displs_.update_device(); + } + }; + // GPU Method // Method that returns size @@ -168,7 +238,7 @@ class MPIArrayKokkos { // Method returns kokkos dual view KOKKOS_INLINE_FUNCTION - TArray1D get_kokkos_dual_view() const; + Kokkos::DualView get_kokkos_dual_view() const; // Method that update host view void update_host(); @@ -176,167 +246,170 @@ class MPIArrayKokkos { // Method that update device view void update_device(); - // MPI send wrapper - void send(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI recieve wrapper - void recv(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI broadcast wrapper - void broadcast(size_t count, int root, MPI_Comm comm); - - // MPI scatter wrapper - 
void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - - // MPI gather wrapper - void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); - - // MPI allgather wrapper - void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); - - // MPI send wrapper - void isend(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI recieve wrapper - void irecv(size_t count, int dest, int tag, MPI_Comm comm); - - // MPI wait wrapper for sender - void wait_send(); - - // MPI wait wrapper for receiver - void wait_recv(); - - // MPI barrier wrapper - //void barrier(MPI_Comm comm); - - // MPI send wrapper - void halo_send(); - - // MPI recieve wrapper - void halo_recv(); - - // MPI send wrapper - void halo_isend(); - - // MPI recieve wrapper - void halo_irecv(); + // Method that builds the send buffer, note, this has to be ordered + // Such that all the boundary elements going to a given rank are contiguous in the send buffer. 
+ void fill_send_buffer(){ + + size_t send_idx = 0; + for(int i = 0; i < comm_plan_->num_send_ranks; i++){ + for(int j = 0; j < comm_plan_->send_counts_.host(i); j++){ + size_t src_idx = comm_plan_->send_indices_.host(i, j); // index of the element to send + + // Copy all values associated with this element (handles multi-dimensional arrays) + for(size_t k = 0; k < stride_; k++){ + send_buffer_.host(send_idx + k) = this_array_.host_pointer()[src_idx * stride_ + k]; + } + send_idx += stride_; + } + } + }; + + // Method that copies the recv buffer into the this_array + void copy_recv_buffer(){ + + size_t recv_idx = 0; + for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ + for(int j = 0; j < comm_plan_->recv_counts_.host(i); j++){ + size_t dest_idx = comm_plan_->recv_indices_.host(i, j); + + // Copy all values associated with this element (handles multi-dimensional arrays) + for(size_t k = 0; k < stride_; k++){ + this_array_.host_pointer()[dest_idx * stride_ + k] = recv_buffer_.host(recv_idx + k); + } + + recv_idx += stride_; + } + } + this_array_.update_device(); + }; + + + // Note: This "may" be needed, im not sure. Currently, it works.... + // Use nullptr for empty arrays to avoid accessing element 0 of 0-sized array (undefined behavior) + // T* send_buf_ptr = (send_buffer_.size() > 0) ? &send_buffer_.host(0) : nullptr; + // T* recv_buf_ptr = (recv_buffer_.size() > 0) ? &recv_buffer_.host(0) : nullptr; + // int* send_cnt_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_counts_.host(0) : nullptr; + // int* send_dsp_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_displs_.host(0) : nullptr; + // int* recv_cnt_ptr = (comm_plan_->num_recv_ranks > 0) ? &comm_plan_->recv_counts_.host(0) : nullptr; + // int* recv_dsp_ptr = (comm_plan_->num_recv_ranks > 0) ? 
&comm_plan_->recv_displs_.host(0) : nullptr; + + // Method that communicates the data between the ranks + // NOTE: This is a blocking communication operation, + // if you want to use non-blocking communication, you can use the following: MPI_Ineighbor_alltoallv + void communicate(){ + + this_array_.update_host(); + + fill_send_buffer(); + + MPI_Neighbor_alltoallv( + send_buffer_.host_pointer(), + send_counts_.host_pointer(), + send_displs_.host_pointer(), + mpi_type_map::value(), // MPI_TYPE + recv_buffer_.host_pointer(), + recv_counts_.host_pointer(), + recv_displs_.host_pointer(), + mpi_type_map::value(), // MPI_TYPE + comm_plan_->mpi_comm_graph); + + copy_recv_buffer(); + + this_array_.update_device(); + }; + + void set_values(const T& value){ + this_array_.set_values(value); + }; // Deconstructor virtual KOKKOS_INLINE_FUNCTION - ~MPIArrayKokkos (); -}; // End of MPIArrayKokkos - + ~MPICArrayKokkos (); +}; // End of MPIDArrayKokkos // Default constructor template -MPIArrayKokkos::MPIArrayKokkos() { - length_ = order_ = 0; - for (int i = 0; i < 7; i++) { - dims_[i] = 0; +MPICArrayKokkos::MPICArrayKokkos() + : this_array_(), stride_(1), length_(0), order_(0) { + for (int i = 0; i < 7; i++) { + dims_[i] = 0; + } } -} // Overloaded 1D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) + : stride_(1), length_(dim0), order_(1) { dims_[0] = dim0; - order_ = 1; - length_ = dim0; - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0); } // Overloaded 2D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& 
tag_string) + : stride_(dim1), length_(dim0 * dim1), order_(2) { dims_[0] = dim0; dims_[1] = dim1; - order_ = 2; - length_ = (dim0 * dim1); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1); - set_mpi_type(); + + this_array_ = DCArrayKokkos(dim0, dim1, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1); } +// Overloaded 3D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) + : stride_(dim1 * dim2), length_(dim0 * dim1 * dim2), order_(3) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; - order_ = 3; - length_ = (dim0 * dim1 * dim2); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2); } +// Overloaded 4D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, size_t dim3, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3), length_(dim0 * dim1 * dim2 * dim3), order_(4) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; dims_[3] = dim3; - order_ = 4; - length_ = (dim0 * dim1 * dim2 * dim3); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3); } +// Overloaded 5D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, 
size_t dim1, - size_t dim2, size_t dim3, - size_t dim4, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4), length_(dim0 * dim1 * dim2 * dim3 * dim4), order_(5) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; dims_[3] = dim3; dims_[4] = dim4; - order_ = 5; - length_ = (dim0 * dim1 * dim2 * dim3 * dim4); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4); } +// Overloaded 6D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, size_t dim3, - size_t dim4, size_t dim5, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4 * dim5), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5), order_(6) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; dims_[3] = dim3; dims_[4] = dim4; dims_[5] = dim5; - order_ = 6; - length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, dim5); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5); } +// Overloaded 7D constructor template -MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, - size_t dim2, size_t dim3, - size_t dim4, size_t dim5, - size_t dim6, const std::string& tag_string) { - +MPICArrayKokkos::MPICArrayKokkos(size_t dim0, 
size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) + : stride_(dim1 * dim2 * dim3 * dim4 * dim5 * dim6), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6), order_(7) { dims_[0] = dim0; dims_[1] = dim1; dims_[2] = dim2; @@ -344,441 +417,163 @@ MPIArrayKokkos::MPIArrayKokkos(size_t dim0, siz dims_[4] = dim4; dims_[5] = dim5; dims_[6] = dim6; - order_ = 7; - length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6); - this_array_ = TArray1D(tag_string, length_); - // Create host ViewCArray - host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); - set_mpi_type(); + this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, dim6, tag_string); + host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); } -template -void MPIArrayKokkos::set_mpi_type() { - if (typeid(T).name() == typeid(bool).name()) { - mpi_datatype_ = MPI_C_BOOL; - } - else if (typeid(T).name() == typeid(int).name()) { - mpi_datatype_ = MPI_INT; - } - else if (typeid(T).name() == typeid(long int).name()) { - mpi_datatype_ = MPI_LONG; - } - else if (typeid(T).name() == typeid(long long int).name()) { - mpi_datatype_ = MPI_LONG_LONG_INT; - } - else if (typeid(T).name() == typeid(float).name()) { - mpi_datatype_ = MPI_FLOAT; - } - else if (typeid(T).name() == typeid(double).name()) { - mpi_datatype_ = MPI_DOUBLE; - } - else { - printf("Your entered MPIArrayKokkos type is not a supported type for MPI communications and is being set to int\n"); - mpi_datatype_ = MPI_INT; - } -} template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i) const { - assert(order_ == 1 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 1D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 1D!"); - return this_array_.d_view(i); +T& MPICArrayKokkos::operator()(size_t i) const { + assert(order_ == 1 && "Tensor order (rank) does not match 
constructor in MPICArrayKokkos 1D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 1D!"); + return this_array_(i); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j) const { - assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 2D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 2D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 2D!"); - return this_array_.d_view(j + (i * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j) const { + assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 2D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 2D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 2D!"); + return this_array_(i, j); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k) const { - assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 3D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 3D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 3D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 3D!"); - return this_array_.d_view(k + (j * dims_[2]) - + (i * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k) const { + assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 3D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 3D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 3D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 3D!"); + return this_array_(i, j, k); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { - assert(order_ == 4 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 
4D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 4D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 4D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 4D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 4D!"); - return this_array_.d_view(l + (k * dims_[3]) - + (j * dims_[3] * dims_[2]) - + (i * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { + assert(order_ == 4 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 4D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 4D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 4D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 4D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 4D!"); + return this_array_(i, j, k, l); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, - size_t m) const { - assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 5D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 5D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 5D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 5D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 5D!"); - assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 5D!"); - return this_array_.d_view(m + (l * dims_[4]) - + (k * dims_[4] * dims_[3]) - + (j * dims_[4] * dims_[3] * dims_[2]) - + (i * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const { + assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 5D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 5D!"); + 
assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 5D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 5D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 5D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 5D!"); + return this_array_(i, j, k, l, m); } template KOKKOS_INLINE_FUNCTION -T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, - size_t m, size_t n) const { - assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 6D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 6D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 6D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 6D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 6D!"); - assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 6D!"); - assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 6D!"); - return this_array_.d_view(n + (m * dims_[5]) - + (l * dims_[5] * dims_[4]) - + (k * dims_[5] * dims_[4] * dims_[3]) - + (j * dims_[5] * dims_[4] * dims_[3] * dims_[2]) - + (i * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) const { + assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 6D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 6D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 6D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 6D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 6D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 6D!"); + assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 6D!"); + return this_array_(i, j, k, l, m, n); } template KOKKOS_INLINE_FUNCTION -T& 
MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, - size_t m, size_t n, size_t o) const { - assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 7D!"); - assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 7D!"); - assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 7D!"); - assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 7D!"); - assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 7D!"); - assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 7D!"); - assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 7D!"); - assert(o >= 0 && o < dims_[6] && "o is out of bounds in MPIArrayKokkos 7D!"); - return this_array_.d_view(o + (n * dims_[6]) - + (m * dims_[6] * dims_[5]) - + (l * dims_[6] * dims_[5] * dims_[4]) - + (k * dims_[6] * dims_[5] * dims_[4] * dims_[3]) - + (j * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2]) - + (i * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n, size_t o) const { + assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 7D!"); + assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 7D!"); + assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 7D!"); + assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 7D!"); + assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 7D!"); + assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 7D!"); + assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 7D!"); + assert(o < dims_[6] && "o is out of bounds in MPICArrayKokkos 7D!"); + return this_array_(i, j, k, l, m, n, o); } template KOKKOS_INLINE_FUNCTION -MPIArrayKokkos& MPIArrayKokkos::operator= (const MPIArrayKokkos& temp) { - - // Do nothing if the assignment is of the form x = x - if (this != 
&temp) { - for (int iter = 0; iter < temp.order_; iter++){ - dims_[iter] = temp.dims_[iter]; - } // end for - - order_ = temp.order_; - length_ = temp.length_; - this_array_ = temp.this_array_; - host = temp.host; - mpi_recv_rank_ = temp.mpi_recv_rank_; - mpi_tag_ = temp.mpi_tag_; - mpi_comm_ = temp.mpi_comm_; - mpi_status_ = temp.mpi_status_; - mpi_datatype_ = temp.mpi_datatype_; - mpi_request_ = temp.mpi_request_; - } - +MPICArrayKokkos& MPICArrayKokkos::operator=(const MPICArrayKokkos& temp) { + this_array_ = temp.this_array_; + host = temp.host; // Also copy the host ViewCArray + comm_plan_ = temp.comm_plan_; + send_buffer_ = temp.send_buffer_; + recv_buffer_ = temp.recv_buffer_; + stride_ = temp.stride_; return *this; } // Return size template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::size() const { - return length_; +size_t MPICArrayKokkos::size() const { + return this_array_.size(); } template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::extent() const { - return length_; +size_t MPICArrayKokkos::extent() const { + return this_array_.extent(); } template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::dims(size_t i) const { - assert(i < order_ && "MPIArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); - assert(i >= 0 && dims_[i]>0 && "Access to MPIArrayKokkos dims is out of bounds!"); - return dims_[i]; +size_t MPICArrayKokkos::dims(size_t i) const { + assert(i < order_ && "MPICArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); + assert(dims_[i]>0 && "Access to MPICArrayKokkos dims is out of bounds!"); + return this_array_.dims(i); } template KOKKOS_INLINE_FUNCTION -size_t MPIArrayKokkos::order() const { - return order_; +size_t MPICArrayKokkos::order() const { + return this_array_.order(); } template KOKKOS_INLINE_FUNCTION -T* MPIArrayKokkos::device_pointer() const { - return this_array_.d_view.data(); +T* MPICArrayKokkos::device_pointer() const { + return this_array_.device_pointer(); } template 
KOKKOS_INLINE_FUNCTION -T* MPIArrayKokkos::host_pointer() const { - return this_array_.h_view.data(); +T* MPICArrayKokkos::host_pointer() const { + return this_array_.host_pointer(); } template KOKKOS_INLINE_FUNCTION -Kokkos::DualView MPIArrayKokkos::get_kokkos_dual_view() const { - return this_array_; -} - -template -void MPIArrayKokkos::update_host() { - - this_array_.template modify(); - this_array_.template sync(); +Kokkos::DualView MPICArrayKokkos::get_kokkos_dual_view() const { + return this_array_.get_kokkos_dual_view(); } template -void MPIArrayKokkos::update_device() { - - this_array_.template modify(); - this_array_.template sync(); +void MPICArrayKokkos::update_host() { + this_array_.update_host(); } -// a default setup, should not be used except for testing template -void MPIArrayKokkos::mpi_setup() { - mpi_recv_rank_ = 1; - mpi_tag_ = 99; - mpi_comm_ = MPI_COMM_WORLD; +void MPICArrayKokkos::update_device() { + this_array_.update_device(); } template -void MPIArrayKokkos::mpi_setup(int recv_rank) { - mpi_recv_rank_ = recv_rank; -} - -template -void MPIArrayKokkos::mpi_setup(int recv_rank, int tag) { - mpi_recv_rank_ = recv_rank; - mpi_tag_ = tag; -} - -template -void MPIArrayKokkos::mpi_setup(int recv_rank, int tag, MPI_Comm comm) { - mpi_recv_rank_ = recv_rank; - mpi_tag_ = tag; - mpi_comm_ = comm; -} - -template -void MPIArrayKokkos::mpi_set_rank(int recv_rank) { - mpi_recv_rank_ = recv_rank; -} - -template -void MPIArrayKokkos::mpi_set_tag(int tag) { - mpi_tag_ = tag; -} - -template -void MPIArrayKokkos::mpi_set_comm(MPI_Comm comm) { - mpi_comm_ = comm; -} - -template -int MPIArrayKokkos::get_rank() { - return mpi_recv_rank_; -} - -template -int MPIArrayKokkos::get_tag() { - return mpi_tag_; -} - -template -MPI_Comm MPIArrayKokkos::get_comm() { - return mpi_comm_; -} - -//MPI_Send wrapper -template -void MPIArrayKokkos::send(size_t count, int dest, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Send(device_pointer(), count, mpi_datatype_, 
dest, tag, comm); -#else - update_host(); - MPI_Send(host_pointer(), count, mpi_datatype_, dest, tag, comm); -#endif -} - -//MPI_Recv wrapper -template -void MPIArrayKokkos::recv(size_t count, int source, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Recv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); -#else - MPI_Recv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); - update_device(); -#endif -} - -//MPI_Send halo wrapper -template -void MPIArrayKokkos::halo_send() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Send(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); -#else - update_host(); - MPI_Send(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); -#endif -} - -//MPI_Recv halo wrapper -template -void MPIArrayKokkos::halo_recv() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Recv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); -#else - MPI_Recv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); - update_device(); -#endif -} - -//MPI_iSend halo wrapper -template -void MPIArrayKokkos::halo_isend() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Isend(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#else - update_host(); - MPI_Isend(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#endif -} - -//MPI_iRecv halo wrapper -template -void MPIArrayKokkos::halo_irecv() { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Irecv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#else - MPI_Irecv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); -#endif -} - -//MPI_Bcast wrapper -template -void MPIArrayKokkos::broadcast(size_t count, int root, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Bcast(device_pointer(), count, mpi_datatype_, root, comm); 
-#else - update_host(); - MPI_Bcast(host_pointer(), count, mpi_datatype_, root, comm); - update_device(); -#endif -} - -//MPI_Scatter wrapper -template -void MPIArrayKokkos::scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Scatter(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); -#else - update_host(); - MPI_Scatter(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); - recv_buffer.update_device(); -#endif -} - -//MPI_Gather wrapper -template -void MPIArrayKokkos::gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Gather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); -#else - update_host(); - MPI_Gather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); - recv_buffer.update_device(); -#endif -} - -//MPI_AllGather wrapper -template -void MPIArrayKokkos::allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Allgather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, comm); -#else - update_host(); - MPI_Allgather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, comm); - recv_buffer.update_device(); -#endif -} - -//MPI_Isend wrapper -template -void MPIArrayKokkos::isend(size_t count, int dest, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Isend(device_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); -#else - update_host(); - MPI_Isend(host_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); -#endif -} - -//MPI_Irecv wrapper -template -void 
MPIArrayKokkos::irecv(size_t count, int source, int tag, MPI_Comm comm) { -#ifdef HAVE_GPU_AWARE_MPI - MPI_Irecv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); -#else - MPI_Irecv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); -#endif -} - -//MPI_Wait wrapper for the sender -template -void MPIArrayKokkos::wait_send() { - MPI_Wait(&mpi_request_, &mpi_status_); -} +KOKKOS_INLINE_FUNCTION +MPICArrayKokkos::~MPICArrayKokkos() { -//MPI_Wait wrapper for the receiver -template -void MPIArrayKokkos::wait_recv() { - MPI_Wait(&mpi_request_, &mpi_status_); -#ifndef HAVE_GPU_AWARE_MPI - update_device(); -#endif } -//MPI_Barrier wrapper -//template -//void MPIArrayKokkos::barrier(MPI_Comm comm) { -// MPI_Barrier(comm); -//} - -template -KOKKOS_INLINE_FUNCTION -MPIArrayKokkos::~MPIArrayKokkos() {} - -//////////////////////////////////////////////////////////////////////////////// -// End of MPIArrayKokkos -//////////////////////////////////////////////////////////////////////////////// +} // end namespace mtr -} // end namespace #endif // end if have MPI - -#endif // MPI_TYPES_H - +#endif // end if MPICARRAYKOKKOS_H \ No newline at end of file diff --git a/src/include/mpi_types_old.h b/src/include/mpi_types_old.h new file mode 100644 index 00000000..b10a57fc --- /dev/null +++ b/src/include/mpi_types_old.h @@ -0,0 +1,784 @@ +#ifndef MPI_TYPES_H +#define MPI_TYPES_H +/********************************************************************************************** + © 2020. Triad National Security, LLC. All rights reserved. + This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos + National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S. + Department of Energy/National Nuclear Security Administration. All rights in the program are + reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear + Security Administration. 
The Government is granted for itself and others acting on its behalf a + nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare + derivative works, distribute copies to the public, perform publicly and display publicly, and + to permit others to do so. + This program is open source under the BSD-3 License. + Redistribution and use in source and binary forms, with or without modification, are permitted + provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, this list of + conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. + + 3. Neither the name of the copyright holder nor the names of its contributors may be used + to endorse or promote products derived from this software without specific prior + written permission. + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ **********************************************************************************************/ + +#include "host_types.h" +#include "kokkos_types.h" +#include +#ifdef HAVE_MPI +#include + +namespace mtr +{ + +///////////////////////// +// MPIArrayKokkos: Dual type for managing distributed data on both CPU and GPU. +///////////////////////// +template +class MPIArrayKokkos { + + // this is manage + using TArray1D = Kokkos::DualView ; + +protected: + size_t dims_[7]; + size_t length_; + size_t order_; // tensor order (rank) + int mpi_recv_rank_; + int mpi_tag_; + MPI_Comm mpi_comm_; + MPI_Status mpi_status_; + MPI_Datatype mpi_datatype_; + MPI_Request mpi_request_; + TArray1D this_array_; + + void set_mpi_type(); + +public: + // Data member to access host view + ViewCArray host; + + MPIArrayKokkos(); + + MPIArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); + + MPIArrayKokkos(size_t dim0, size_t dim1, size_t dim2, + size_t dim3, size_t dim4, size_t dim5, + size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); + + // These functions can setup the data needed for halo send/receives + // Not necessary for standard MPI comms + void mpi_setup(); + + void mpi_setup(int recv_rank); + + void mpi_setup(int recv_rank, int tag); + + void mpi_setup(int recv_rank, int tag, MPI_Comm comm); + + void mpi_set_rank(int 
recv_rank); + + void mpi_set_tag(int tag); + + void mpi_set_comm(MPI_Comm comm); + + int get_rank(); + + int get_tag(); + + MPI_Comm get_comm(); + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n) const; + + KOKKOS_INLINE_FUNCTION + T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, + size_t n, size_t o) const; + + KOKKOS_INLINE_FUNCTION + MPIArrayKokkos& operator=(const MPIArrayKokkos& temp); + + // GPU Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t size() const; + + // Host Method + // Method that returns size + KOKKOS_INLINE_FUNCTION + size_t extent() const; + + KOKKOS_INLINE_FUNCTION + size_t dims(size_t i) const; + + KOKKOS_INLINE_FUNCTION + size_t order() const; + + // Method returns the raw device pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* device_pointer() const; + + // Method returns the raw host pointer of the Kokkos DualView + KOKKOS_INLINE_FUNCTION + T* host_pointer() const; + + // Method returns kokkos dual view + KOKKOS_INLINE_FUNCTION + TArray1D get_kokkos_dual_view() const; + + // Method that update host view + void update_host(); + + // Method that update device view + void update_device(); + + // MPI send wrapper + void send(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI recieve wrapper + void recv(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI broadcast wrapper + void broadcast(size_t count, int root, MPI_Comm comm); + + // MPI scatter wrapper + void scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, 
int root, MPI_Comm comm); + + // MPI gather wrapper + void gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm); + + // MPI allgather wrapper + void allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, MPI_Comm comm); + + // MPI send wrapper + void isend(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI recieve wrapper + void irecv(size_t count, int dest, int tag, MPI_Comm comm); + + // MPI wait wrapper for sender + void wait_send(); + + // MPI wait wrapper for receiver + void wait_recv(); + + // MPI barrier wrapper + //void barrier(MPI_Comm comm); + + // MPI send wrapper + void halo_send(); + + // MPI recieve wrapper + void halo_recv(); + + // MPI send wrapper + void halo_isend(); + + // MPI recieve wrapper + void halo_irecv(); + + // Deconstructor + virtual KOKKOS_INLINE_FUNCTION + ~MPIArrayKokkos (); +}; // End of MPIArrayKokkos + + +// Default constructor +template +MPIArrayKokkos::MPIArrayKokkos() { + length_ = order_ = 0; + for (int i = 0; i < 7; i++) { + dims_[i] = 0; + } +} + +// Overloaded 1D constructor +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, const std::string& tag_string) { + + dims_[0] = dim0; + order_ = 1; + length_ = dim0; + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0); + set_mpi_type(); +} + +// Overloaded 2D constructor +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + order_ = 2; + length_ = (dim0 * dim1); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + order_ = 3; + length_ = (dim0 * dim1 * dim2); + 
this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + order_ = 4; + length_ = (dim0 * dim1 * dim2 * dim3); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, + size_t dim4, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + order_ = 5; + length_ = (dim0 * dim1 * dim2 * dim3 * dim4); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, + size_t dim4, size_t dim5, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + dims_[5] = dim5; + order_ = 6; + length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5); + this_array_ = TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, dim5); + set_mpi_type(); +} + +template +MPIArrayKokkos::MPIArrayKokkos(size_t dim0, size_t dim1, + size_t dim2, size_t dim3, + size_t dim4, size_t dim5, + size_t dim6, const std::string& tag_string) { + + dims_[0] = dim0; + dims_[1] = dim1; + dims_[2] = dim2; + dims_[3] = dim3; + dims_[4] = dim4; + dims_[5] = dim5; + dims_[6] = dim6; + order_ = 7; + length_ = (dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6); + this_array_ = 
TArray1D(tag_string, length_); + // Create host ViewCArray + host = ViewCArray (this_array_.h_view.data(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); + set_mpi_type(); +} + +template +void MPIArrayKokkos::set_mpi_type() { + if (typeid(T).name() == typeid(bool).name()) { + mpi_datatype_ = MPI_C_BOOL; + } + else if (typeid(T).name() == typeid(int).name()) { + mpi_datatype_ = MPI_INT; + } + else if (typeid(T).name() == typeid(long int).name()) { + mpi_datatype_ = MPI_LONG; + } + else if (typeid(T).name() == typeid(long long int).name()) { + mpi_datatype_ = MPI_LONG_LONG_INT; + } + else if (typeid(T).name() == typeid(float).name()) { + mpi_datatype_ = MPI_FLOAT; + } + else if (typeid(T).name() == typeid(double).name()) { + mpi_datatype_ = MPI_DOUBLE; + } + else { + printf("Your entered MPIArrayKokkos type is not a supported type for MPI communications and is being set to int\n"); + mpi_datatype_ = MPI_INT; + } +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i) const { + assert(order_ == 1 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 1D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 1D!"); + return this_array_.d_view(i); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j) const { + assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 2D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 2D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 2D!"); + return this_array_.d_view(j + (i * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k) const { + assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 3D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 3D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 3D!"); + assert(k >= 
0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 3D!"); + return this_array_.d_view(k + (j * dims_[2]) + + (i * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { + assert(order_ == 4 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 4D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 4D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 4D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 4D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 4D!"); + return this_array_.d_view(l + (k * dims_[3]) + + (j * dims_[3] * dims_[2]) + + (i * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, + size_t m) const { + assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 5D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 5D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 5D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 5D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 5D!"); + assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 5D!"); + return this_array_.d_view(m + (l * dims_[4]) + + (k * dims_[4] * dims_[3]) + + (j * dims_[4] * dims_[3] * dims_[2]) + + (i * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, + size_t m, size_t n) const { + assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 6D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 6D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 6D!"); + 
assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 6D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 6D!"); + assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 6D!"); + assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 6D!"); + return this_array_.d_view(n + (m * dims_[5]) + + (l * dims_[5] * dims_[4]) + + (k * dims_[5] * dims_[4] * dims_[3]) + + (j * dims_[5] * dims_[4] * dims_[3] * dims_[2]) + + (i * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +T& MPIArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, + size_t m, size_t n, size_t o) const { + assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPIArrayKokkos 7D!"); + assert(i >= 0 && i < dims_[0] && "i is out of bounds in MPIArrayKokkos 7D!"); + assert(j >= 0 && j < dims_[1] && "j is out of bounds in MPIArrayKokkos 7D!"); + assert(k >= 0 && k < dims_[2] && "k is out of bounds in MPIArrayKokkos 7D!"); + assert(l >= 0 && l < dims_[3] && "l is out of bounds in MPIArrayKokkos 7D!"); + assert(m >= 0 && m < dims_[4] && "m is out of bounds in MPIArrayKokkos 7D!"); + assert(n >= 0 && n < dims_[5] && "n is out of bounds in MPIArrayKokkos 7D!"); + assert(o >= 0 && o < dims_[6] && "o is out of bounds in MPIArrayKokkos 7D!"); + return this_array_.d_view(o + (n * dims_[6]) + + (m * dims_[6] * dims_[5]) + + (l * dims_[6] * dims_[5] * dims_[4]) + + (k * dims_[6] * dims_[5] * dims_[4] * dims_[3]) + + (j * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2]) + + (i * dims_[6] * dims_[5] * dims_[4] * dims_[3] * dims_[2] * dims_[1])); +} + +template +KOKKOS_INLINE_FUNCTION +MPIArrayKokkos& MPIArrayKokkos::operator= (const MPIArrayKokkos& temp) { + + // Do nothing if the assignment is of the form x = x + if (this != &temp) { + for (int iter = 0; iter < temp.order_; iter++){ + dims_[iter] = temp.dims_[iter]; + } // end for + + order_ = temp.order_; + length_ = 
temp.length_; + this_array_ = temp.this_array_; + host = temp.host; + mpi_recv_rank_ = temp.mpi_recv_rank_; + mpi_tag_ = temp.mpi_tag_; + mpi_comm_ = temp.mpi_comm_; + mpi_status_ = temp.mpi_status_; + mpi_datatype_ = temp.mpi_datatype_; + mpi_request_ = temp.mpi_request_; + } + + return *this; +} + +// Return size +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::size() const { + return length_; +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::extent() const { + return length_; +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::dims(size_t i) const { + assert(i < order_ && "MPIArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); + assert(i >= 0 && dims_[i]>0 && "Access to MPIArrayKokkos dims is out of bounds!"); + return dims_[i]; +} + +template +KOKKOS_INLINE_FUNCTION +size_t MPIArrayKokkos::order() const { + return order_; +} + +template +KOKKOS_INLINE_FUNCTION +T* MPIArrayKokkos::device_pointer() const { + return this_array_.d_view.data(); +} + +template +KOKKOS_INLINE_FUNCTION +T* MPIArrayKokkos::host_pointer() const { + return this_array_.h_view.data(); +} + +template +KOKKOS_INLINE_FUNCTION +Kokkos::DualView MPIArrayKokkos::get_kokkos_dual_view() const { + return this_array_; +} + +template +void MPIArrayKokkos::update_host() { + + this_array_.template modify(); + this_array_.template sync(); +} + +template +void MPIArrayKokkos::update_device() { + + this_array_.template modify(); + this_array_.template sync(); +} + +// a default setup, should not be used except for testing +template +void MPIArrayKokkos::mpi_setup() { + mpi_recv_rank_ = 1; + mpi_tag_ = 99; + mpi_comm_ = MPI_COMM_WORLD; +} + +template +void MPIArrayKokkos::mpi_setup(int recv_rank) { + mpi_recv_rank_ = recv_rank; +} + +template +void MPIArrayKokkos::mpi_setup(int recv_rank, int tag) { + mpi_recv_rank_ = recv_rank; + mpi_tag_ = tag; +} + +template +void MPIArrayKokkos::mpi_setup(int recv_rank, int tag, MPI_Comm comm) { + 
mpi_recv_rank_ = recv_rank; + mpi_tag_ = tag; + mpi_comm_ = comm; +} + +template +void MPIArrayKokkos::mpi_set_rank(int recv_rank) { + mpi_recv_rank_ = recv_rank; +} + +template +void MPIArrayKokkos::mpi_set_tag(int tag) { + mpi_tag_ = tag; +} + +template +void MPIArrayKokkos::mpi_set_comm(MPI_Comm comm) { + mpi_comm_ = comm; +} + +template +int MPIArrayKokkos::get_rank() { + return mpi_recv_rank_; +} + +template +int MPIArrayKokkos::get_tag() { + return mpi_tag_; +} + +template +MPI_Comm MPIArrayKokkos::get_comm() { + return mpi_comm_; +} + +//MPI_Send wrapper +template +void MPIArrayKokkos::send(size_t count, int dest, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Send(device_pointer(), count, mpi_datatype_, dest, tag, comm); +#else + update_host(); + MPI_Send(host_pointer(), count, mpi_datatype_, dest, tag, comm); +#endif +} + +//MPI_Recv wrapper +template +void MPIArrayKokkos::recv(size_t count, int source, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Recv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); +#else + MPI_Recv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_status_); + update_device(); +#endif +} + +//MPI_Send halo wrapper +template +void MPIArrayKokkos::halo_send() { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Send(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); +#else + update_host(); + MPI_Send(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_); +#endif +} + +//MPI_Recv halo wrapper +template +void MPIArrayKokkos::halo_recv() { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Recv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); +#else + MPI_Recv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_status_); + update_device(); +#endif +} + +//MPI_iSend halo wrapper +template +void MPIArrayKokkos::halo_isend() { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Isend(device_pointer(), size(), 
mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#else + update_host(); + MPI_Isend(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#endif +} + +//MPI_iRecv halo wrapper +template +void MPIArrayKokkos::halo_irecv() { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Irecv(device_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#else + MPI_Irecv(host_pointer(), size(), mpi_datatype_, mpi_recv_rank_, mpi_tag_, mpi_comm_, &mpi_request_); +#endif +} + +//MPI_Bcast wrapper +template +void MPIArrayKokkos::broadcast(size_t count, int root, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Bcast(device_pointer(), count, mpi_datatype_, root, comm); +#else + update_host(); + MPI_Bcast(host_pointer(), count, mpi_datatype_, root, comm); + update_device(); +#endif +} + +//MPI_Scatter wrapper +template +void MPIArrayKokkos::scatter(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Scatter(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); +#else + update_host(); + MPI_Scatter(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); + recv_buffer.update_device(); +#endif +} + +//MPI_Gather wrapper +template +void MPIArrayKokkos::gather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, int root, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Gather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, root, comm); +#else + update_host(); + MPI_Gather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, root, comm); + recv_buffer.update_device(); +#endif +} + +//MPI_AllGather wrapper +template +void MPIArrayKokkos::allgather(size_t send_count, MPIArrayKokkos recv_buffer, size_t recv_count, 
MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Allgather(device_pointer(), send_count, mpi_datatype_, recv_buffer.device_pointer(), recv_count, mpi_datatype_, comm); +#else + update_host(); + MPI_Allgather(host_pointer(), send_count, mpi_datatype_, recv_buffer.host_pointer(), recv_count, mpi_datatype_, comm); + recv_buffer.update_device(); +#endif +} + +//MPI_Isend wrapper +template +void MPIArrayKokkos::isend(size_t count, int dest, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Isend(device_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); +#else + update_host(); + MPI_Isend(host_pointer(), count, mpi_datatype_, dest, tag, comm, &mpi_request_); +#endif +} + +//MPI_Irecv wrapper +template +void MPIArrayKokkos::irecv(size_t count, int source, int tag, MPI_Comm comm) { +#ifdef HAVE_GPU_AWARE_MPI + MPI_Irecv(device_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); +#else + MPI_Irecv(host_pointer(), count, mpi_datatype_, source, tag, comm, &mpi_request_); +#endif +} + +//MPI_Wait wrapper for the sender +template +void MPIArrayKokkos::wait_send() { + MPI_Wait(&mpi_request_, &mpi_status_); +} + +//MPI_Wait wrapper for the receiver +template +void MPIArrayKokkos::wait_recv() { + MPI_Wait(&mpi_request_, &mpi_status_); +#ifndef HAVE_GPU_AWARE_MPI + update_device(); +#endif +} + +//MPI_Barrier wrapper +//template +//void MPIArrayKokkos::barrier(MPI_Comm comm) { +// MPI_Barrier(comm); +//} + +template +KOKKOS_INLINE_FUNCTION +MPIArrayKokkos::~MPIArrayKokkos() {} + +//////////////////////////////////////////////////////////////////////////////// +// End of MPIArrayKokkos +//////////////////////////////////////////////////////////////////////////////// + +} // end namespace + +#endif // end if have MPI + +#endif // MPI_TYPES_H + From 0cd362080001f2448e48f259e7451c17f739aa27 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 10 Nov 2025 16:35:13 -0600 Subject: [PATCH 30/52] ENH: Moving mpi type to base MATAR --- 
examples/mesh_decomp/mpi_type.h | 579 -------------------------------- 1 file changed, 579 deletions(-) delete mode 100644 examples/mesh_decomp/mpi_type.h diff --git a/examples/mesh_decomp/mpi_type.h b/examples/mesh_decomp/mpi_type.h deleted file mode 100644 index be7984d5..00000000 --- a/examples/mesh_decomp/mpi_type.h +++ /dev/null @@ -1,579 +0,0 @@ -#ifndef MPICARRAYKOKKOS_H -#define MPICARRAYKOKKOS_H - -// #ifdef HAVE_MPI -#include -#include "matar.h" -#include "communication_plan.h" - -namespace mtr -{ - -// Type trait to map C++ types to MPI_Datatype -template -struct mpi_type_map { - static MPI_Datatype value() { - static_assert(sizeof(T) == 0, "Unsupported type for MPI communication"); - return MPI_DATATYPE_NULL; - } -}; - -// Specializations for common types -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_INT; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_LONG; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_LONG_LONG; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_UNSIGNED; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_UNSIGNED_LONG; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_FLOAT; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_DOUBLE; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_CHAR; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_UNSIGNED_CHAR; } -}; - -template <> -struct mpi_type_map { - static MPI_Datatype value() { return MPI_C_BOOL; } -}; - - -///////////////////////// -// MPICArrayKokkos: Type for managing distributed data on both CPU and GPU. 
-///////////////////////// -template -class MPICArrayKokkos { - - // Dual view for managing data on both CPU and GPU - DCArrayKokkos this_array_; - - DCArrayKokkos send_buffer_; - DCArrayKokkos recv_buffer_; - -protected: - size_t dims_[7]; - size_t length_; - size_t order_; // tensor order (rank) - - MPI_Comm mpi_comm_; - MPI_Status mpi_status_; - MPI_Datatype mpi_datatype_; - MPI_Request mpi_request_; - - - // --- Ghost Communication Support --- - CommunicationPlan* comm_plan_; // Pointer to shared communication plan - - - DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank - DCArrayKokkos recv_counts_; // [size: num_recv_ranks] Number of items to receive from each rank - DCArrayKokkos send_displs_; // [size: num_send_ranks] Starting index of items to send to each rank - DCArrayKokkos recv_displs_; // [size: num_recv_ranks] Starting index of items to receive from each rank - - size_t stride_; // [size: num_dims] Number of contiguous values per first index element - - - DRaggedRightArrayKokkos send_indices_; // [size: num_send_ranks, num_items_to_send_by_rank] Indices of items to send to each rank - DRaggedRightArrayKokkos recv_indices_; // [size: num_recv_ranks, num_items_to_recv_by_rank] Indices of items to receive from each rank - - - size_t num_owned_; // Number of owned items (nodes/elements) - size_t num_ghost_; // Number of ghost items (nodes/elements) - -public: - // Data member to access host view - ViewCArray host; - - - // Note, consider this for sending blocks without dealing with stride_ - // MPI_Datatype vector_type; - // MPI_Type_contiguous(stride_, mpi_type_map::value(), &vector_type); - // MPI_Type_commit(&vector_type); - - MPICArrayKokkos(); - - MPICArrayKokkos(size_t dim0, const std::string& tag_string = DEFAULTSTRINGARRAY); - - MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string = DEFAULTSTRINGARRAY); - - MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& 
tag_string = DEFAULTSTRINGARRAY); - - MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, const std::string& tag_string = DEFAULTSTRINGARRAY); - - MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, const std::string& tag_string = DEFAULTSTRINGARRAY); - - MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string = DEFAULTSTRINGARRAY); - - MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, - size_t dim3, size_t dim4, size_t dim5, - size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); - - - - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i) const; - - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j) const; - - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k) const; - - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l) const; - - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const; - - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, - size_t n) const; - - KOKKOS_INLINE_FUNCTION - T& operator()(size_t i, size_t j, size_t k, size_t l, size_t m, - size_t n, size_t o) const; - - KOKKOS_INLINE_FUNCTION - MPICArrayKokkos& operator=(const MPICArrayKokkos& temp); - - - // Method to set comm plan for halo communication - void initialize_comm_plan(CommunicationPlan& comm_plan){ - comm_plan_ = &comm_plan; - - size_t send_size = comm_plan_->total_send_count * stride_; - size_t recv_size = comm_plan_->total_recv_count * stride_; - - if (send_size > 0) { - send_buffer_ = DCArrayKokkos(send_size, "send_buffer"); - } - if (recv_size > 0) { - recv_buffer_ = DCArrayKokkos(recv_size, "recv_buffer"); - } - - if (comm_plan_->num_send_ranks > 0) { - send_counts_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_counts"); - send_displs_ = DCArrayKokkos(comm_plan_->num_send_ranks, "send_displs"); - - for(int i = 0; i < 
comm_plan_->num_send_ranks; i++){ - send_counts_.host(i) = comm_plan_->send_counts_.host(i) * stride_; - send_displs_.host(i) = comm_plan_->send_displs_.host(i) * stride_; - } - send_counts_.update_device(); - send_displs_.update_device(); - } - - if (comm_plan_->num_recv_ranks > 0) { - recv_counts_ = DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_counts"); - recv_displs_ = DCArrayKokkos(comm_plan_->num_recv_ranks, "recv_displs"); - - for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ - recv_counts_.host(i) = comm_plan_->recv_counts_.host(i) * stride_; - recv_displs_.host(i) = comm_plan_->recv_displs_.host(i) * stride_; - } - recv_counts_.update_device(); - recv_displs_.update_device(); - } - }; - - - // GPU Method - // Method that returns size - KOKKOS_INLINE_FUNCTION - size_t size() const; - - // Host Method - // Method that returns size - KOKKOS_INLINE_FUNCTION - size_t extent() const; - - KOKKOS_INLINE_FUNCTION - size_t dims(size_t i) const; - - KOKKOS_INLINE_FUNCTION - size_t order() const; - - // Method returns the raw device pointer of the Kokkos DualView - KOKKOS_INLINE_FUNCTION - T* device_pointer() const; - - // Method returns the raw host pointer of the Kokkos DualView - KOKKOS_INLINE_FUNCTION - T* host_pointer() const; - - // Method returns kokkos dual view - KOKKOS_INLINE_FUNCTION - Kokkos::DualView get_kokkos_dual_view() const; - - // Method that update host view - void update_host(); - - // Method that update device view - void update_device(); - - // Method that builds the send buffer, note, this has to be ordered - // Such that all the boundary elements going to a given rank are contiguous in the send buffer. 
- void fill_send_buffer(){ - - size_t send_idx = 0; - for(int i = 0; i < comm_plan_->num_send_ranks; i++){ - for(int j = 0; j < comm_plan_->send_counts_.host(i); j++){ - size_t src_idx = comm_plan_->send_indices_.host(i, j); // index of the element to send - - // Copy all values associated with this element (handles multi-dimensional arrays) - for(size_t k = 0; k < stride_; k++){ - send_buffer_.host(send_idx + k) = this_array_.host_pointer()[src_idx * stride_ + k]; - } - send_idx += stride_; - } - } - }; - - // Method that copies the recv buffer into the this_array - void copy_recv_buffer(){ - - size_t recv_idx = 0; - for(int i = 0; i < comm_plan_->num_recv_ranks; i++){ - for(int j = 0; j < comm_plan_->recv_counts_.host(i); j++){ - size_t dest_idx = comm_plan_->recv_indices_.host(i, j); - - // Copy all values associated with this element (handles multi-dimensional arrays) - for(size_t k = 0; k < stride_; k++){ - this_array_.host_pointer()[dest_idx * stride_ + k] = recv_buffer_.host(recv_idx + k); - } - - recv_idx += stride_; - } - } - this_array_.update_device(); - }; - - - // Note: This "may" be needed, im not sure. Currently, it works.... - // Use nullptr for empty arrays to avoid accessing element 0 of 0-sized array (undefined behavior) - // T* send_buf_ptr = (send_buffer_.size() > 0) ? &send_buffer_.host(0) : nullptr; - // T* recv_buf_ptr = (recv_buffer_.size() > 0) ? &recv_buffer_.host(0) : nullptr; - // int* send_cnt_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_counts_.host(0) : nullptr; - // int* send_dsp_ptr = (comm_plan_->num_send_ranks > 0) ? &comm_plan_->send_displs_.host(0) : nullptr; - // int* recv_cnt_ptr = (comm_plan_->num_recv_ranks > 0) ? &comm_plan_->recv_counts_.host(0) : nullptr; - // int* recv_dsp_ptr = (comm_plan_->num_recv_ranks > 0) ? 
&comm_plan_->recv_displs_.host(0) : nullptr; - - // Method that communicates the data between the ranks - // NOTE: This is a blocking communication operation, - // if you want to use non-blocking communication, you can use the following: MPI_Ineighbor_alltoallv - void communicate(){ - - this_array_.update_host(); - - fill_send_buffer(); - - MPI_Neighbor_alltoallv( - send_buffer_.host_pointer(), - send_counts_.host_pointer(), - send_displs_.host_pointer(), - mpi_type_map::value(), // MPI_TYPE - recv_buffer_.host_pointer(), - recv_counts_.host_pointer(), - recv_displs_.host_pointer(), - mpi_type_map::value(), // MPI_TYPE - comm_plan_->mpi_comm_graph); - - copy_recv_buffer(); - - this_array_.update_device(); - }; - - void set_values(const T& value){ - this_array_.set_values(value); - }; - - // Deconstructor - virtual KOKKOS_INLINE_FUNCTION - ~MPICArrayKokkos (); -}; // End of MPIDArrayKokkos - -// Default constructor -template -MPICArrayKokkos::MPICArrayKokkos() - : this_array_(), stride_(1), length_(0), order_(0) { - for (int i = 0; i < 7; i++) { - dims_[i] = 0; - } - } - -// Overloaded 1D constructor -template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, const std::string& tag_string) - : stride_(1), length_(dim0), order_(1) { - dims_[0] = dim0; - this_array_ = DCArrayKokkos(dim0, tag_string); - host = ViewCArray (this_array_.host_pointer(), dim0); -} - -// Overloaded 2D constructor -template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, const std::string& tag_string) - : stride_(dim1), length_(dim0 * dim1), order_(2) { - dims_[0] = dim0; - dims_[1] = dim1; - - this_array_ = DCArrayKokkos(dim0, dim1, tag_string); - host = ViewCArray (this_array_.host_pointer(), dim0, dim1); -} - -// Overloaded 3D constructor -template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, const std::string& tag_string) - : stride_(dim1 * dim2), length_(dim0 * dim1 * dim2), order_(3) { - dims_[0] = dim0; - dims_[1] = dim1; - dims_[2] = dim2; - 
this_array_ = DCArrayKokkos(dim0, dim1, dim2, tag_string); - host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2); -} - -// Overloaded 4D constructor -template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3), length_(dim0 * dim1 * dim2 * dim3), order_(4) { - dims_[0] = dim0; - dims_[1] = dim1; - dims_[2] = dim2; - dims_[3] = dim3; - this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, tag_string); - host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3); -} - -// Overloaded 5D constructor -template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3 * dim4), length_(dim0 * dim1 * dim2 * dim3 * dim4), order_(5) { - dims_[0] = dim0; - dims_[1] = dim1; - dims_[2] = dim2; - dims_[3] = dim3; - dims_[4] = dim4; - this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, tag_string); - host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4); -} - -// Overloaded 6D constructor -template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3 * dim4 * dim5), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5), order_(6) { - dims_[0] = dim0; - dims_[1] = dim1; - dims_[2] = dim2; - dims_[3] = dim3; - dims_[4] = dim4; - dims_[5] = dim5; - this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, tag_string); - host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5); -} - -// Overloaded 7D constructor -template -MPICArrayKokkos::MPICArrayKokkos(size_t dim0, size_t dim1, size_t dim2, size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string) - : stride_(dim1 * dim2 * dim3 * dim4 * dim5 * dim6), length_(dim0 * dim1 * dim2 * dim3 * dim4 * dim5 * dim6), 
order_(7) { - dims_[0] = dim0; - dims_[1] = dim1; - dims_[2] = dim2; - dims_[3] = dim3; - dims_[4] = dim4; - dims_[5] = dim5; - dims_[6] = dim6; - this_array_ = DCArrayKokkos(dim0, dim1, dim2, dim3, dim4, dim5, dim6, tag_string); - host = ViewCArray (this_array_.host_pointer(), dim0, dim1, dim2, dim3, dim4, dim5, dim6); -} - - -template -KOKKOS_INLINE_FUNCTION -T& MPICArrayKokkos::operator()(size_t i) const { - assert(order_ == 1 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 1D!"); - assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 1D!"); - return this_array_(i); -} - -template -KOKKOS_INLINE_FUNCTION -T& MPICArrayKokkos::operator()(size_t i, size_t j) const { - assert(order_ == 2 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 2D!"); - assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 2D!"); - assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 2D!"); - return this_array_(i, j); -} - -template -KOKKOS_INLINE_FUNCTION -T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k) const { - assert(order_ == 3 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 3D!"); - assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 3D!"); - assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 3D!"); - assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 3D!"); - return this_array_(i, j, k); -} - -template -KOKKOS_INLINE_FUNCTION -T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l) const { - assert(order_ == 4 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 4D!"); - assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 4D!"); - assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 4D!"); - assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 4D!"); - assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 4D!"); - return this_array_(i, j, k, l); -} - -template -KOKKOS_INLINE_FUNCTION 
-T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m) const { - assert(order_ == 5 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 5D!"); - assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 5D!"); - assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 5D!"); - assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 5D!"); - assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 5D!"); - assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 5D!"); - return this_array_(i, j, k, l, m); -} - -template -KOKKOS_INLINE_FUNCTION -T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) const { - assert(order_ == 6 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 6D!"); - assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 6D!"); - assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 6D!"); - assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 6D!"); - assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 6D!"); - assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 6D!"); - assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 6D!"); - return this_array_(i, j, k, l, m, n); -} - -template -KOKKOS_INLINE_FUNCTION -T& MPICArrayKokkos::operator()(size_t i, size_t j, size_t k, size_t l, size_t m, size_t n, size_t o) const { - assert(order_ == 7 && "Tensor order (rank) does not match constructor in MPICArrayKokkos 7D!"); - assert(i < dims_[0] && "i is out of bounds in MPICArrayKokkos 7D!"); - assert(j < dims_[1] && "j is out of bounds in MPICArrayKokkos 7D!"); - assert(k < dims_[2] && "k is out of bounds in MPICArrayKokkos 7D!"); - assert(l < dims_[3] && "l is out of bounds in MPICArrayKokkos 7D!"); - assert(m < dims_[4] && "m is out of bounds in MPICArrayKokkos 7D!"); - assert(n < dims_[5] && "n is out of bounds in MPICArrayKokkos 7D!"); - assert(o < dims_[6] && "o is out of 
bounds in MPICArrayKokkos 7D!"); - return this_array_(i, j, k, l, m, n, o); -} - -template -KOKKOS_INLINE_FUNCTION -MPICArrayKokkos& MPICArrayKokkos::operator=(const MPICArrayKokkos& temp) { - this_array_ = temp.this_array_; - host = temp.host; // Also copy the host ViewCArray - comm_plan_ = temp.comm_plan_; - send_buffer_ = temp.send_buffer_; - recv_buffer_ = temp.recv_buffer_; - stride_ = temp.stride_; - return *this; -} - -// Return size -template -KOKKOS_INLINE_FUNCTION -size_t MPICArrayKokkos::size() const { - return this_array_.size(); -} - -template -KOKKOS_INLINE_FUNCTION -size_t MPICArrayKokkos::extent() const { - return this_array_.extent(); -} - -template -KOKKOS_INLINE_FUNCTION -size_t MPICArrayKokkos::dims(size_t i) const { - assert(i < order_ && "MPICArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); - assert(dims_[i]>0 && "Access to MPICArrayKokkos dims is out of bounds!"); - return this_array_.dims(i); -} - -template -KOKKOS_INLINE_FUNCTION -size_t MPICArrayKokkos::order() const { - return this_array_.order(); -} - -template -KOKKOS_INLINE_FUNCTION -T* MPICArrayKokkos::device_pointer() const { - return this_array_.device_pointer(); -} - -template -KOKKOS_INLINE_FUNCTION -T* MPICArrayKokkos::host_pointer() const { - return this_array_.host_pointer(); -} - -template -KOKKOS_INLINE_FUNCTION -Kokkos::DualView MPICArrayKokkos::get_kokkos_dual_view() const { - return this_array_.get_kokkos_dual_view(); -} - -template -void MPICArrayKokkos::update_host() { - this_array_.update_host(); -} - -template -void MPICArrayKokkos::update_device() { - this_array_.update_device(); -} - -template -KOKKOS_INLINE_FUNCTION -MPICArrayKokkos::~MPICArrayKokkos() { - -} - -} // end namespace mtr - - -// #endif // end if have MPI -#endif // end if MPICARRAYKOKKOS_H \ No newline at end of file From 9748087165154acd01d8effffe20ff92c1283929 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 17 Nov 2025 14:00:58 -0600 Subject: [PATCH 31/52] ENH: 
Pulling out build ghost function --- examples/mesh_decomp/decomp_utils.h | 2488 +++++++++++++------------- examples/mesh_decomp/mesh_decomp.cpp | 2 +- 2 files changed, 1246 insertions(+), 1244 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index f0e7ae4d..5cdf4a6f 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -552,1480 +552,1482 @@ void naive_partition_mesh( return; } - -/** - * @brief Partitions the input mesh using PT-Scotch and constructs the final distributed mesh. - * - * This function performs parallel mesh partitioning using a two-stage approach: - * 1. A naive partition is first constructed (simple assignment of mesh elements/nodes across ranks). - * 2. PT-Scotch is then used to repartition the mesh for load balancing and improved connectivity. - * - * The partitioned mesh, nodal data, and associated connectivity/gauss point information - * are distributed among MPI ranks as a result. The procedure ensures that each rank receives - * its assigned portion of the mesh and associated data in the final (target) decomposition. - * - * @param initial_mesh[in] The input (global) mesh, present on rank 0 or all ranks at start. - * @param final_mesh[out] The mesh assigned to this rank after PT-Scotch decomposition. - * @param initial_node[in] Nodal data for the input (global) mesh; must match initial_mesh. - * @param final_node[out] Nodal data for this rank after decomposition (corresponds to final_mesh). - * @param gauss_point[out] Gauss point data structure, filled out for this rank's mesh. - * @param world_size[in] Number of MPI ranks in use (the total number of partitions). - * @param rank[in] This process's MPI rank ID. - * - * Internals: - * - The routine uses a naive_partition_mesh() helper to create an initial contiguous mesh partition. - * - It then uses PT-Scotch distributed graph routines to compute an improved partition and create the final mesh layout. 
- * - Both element-to-element and node-to-element connectivity, as well as mapping and ghosting information, - * are managed and exchanged across ranks. - * - MPI routines synchronize and exchange the relevant mesh and nodal data following the computed partition. - */ - -void partition_mesh( - Mesh_t& initial_mesh, - Mesh_t& final_mesh, - node_t& initial_node, - node_t& final_node, - GaussPoint_t& gauss_point, +void build_ghost( + Mesh_t& input_mesh, + Mesh_t& output_mesh, + node_t& input_node, + node_t& output_node, + CommunicationPlan& element_communication_plan, int world_size, - int rank){ - + int rank) +{ bool print_info = false; - bool print_vtk = false; + // ****************************************************************************************** + // Build the ghost elements and nodes + // ================================================================================================** + // + // OVERVIEW OF GHOST ELEMENT IDENTIFICATION: + // ========================================== + // In distributed memory parallel computing with MPI, each processor (rank) owns a subset of mesh + // elements. However, to perform computations that depend on element neighbors or to maintain + // consistency at domain boundaries, we need ghost elements: copies of elements from neighboring + // ranks that share nodes with our locally-owned elements. + // + // This algorithm identifies and extracts ghost element data in 5 steps: + // 1. Gather ownership information: Which rank owns which elements (via MPI_Allgatherv) + // 2. Collect local element-node connectivity for distribution + // 3. Broadcast connectivity to all ranks (via MPI_Allgatherv) + // 4. Identify which remote elements touch our local elements + // 5. 
Extract the full connectivity data for identified ghost elements + double t_ghost_start = MPI_Wtime(); + + // ======================================================================== + // STEP 1: Gather element ownership information from all ranks + // ======================================================================== + // In a distributed mesh, each rank owns a subset of elements. To identify + // ghost elements (elements from other ranks needed by this rank), we need + // to know which rank owns each element. This section uses MPI collective + // operations to gather element GID ownership information. + // + // MPI COLLECTIVE OPERATIONS EXPLAINED: + // ==================================== + // - MPI_Barrier: Synchronizes all ranks; waits until all ranks reach this point + // - MPI_Allgather: Each rank sends one item of data; each rank receives one item from each rank + // Input: Each rank provides local data + // Output: Every rank has data from every rank in order (rank 0's data, rank 1's data, ...) + // - MPI_Allgatherv: Like MPI_Allgather but for variable-sized data + // Input: Each rank provides data of potentially different sizes + // Output: Every rank has all data from all ranks, with displacement arrays specifying where each rank's data goes + // + // COMMUNICATION PATTERN VISUALIZATION: + // Rank 0: elem_count[0] ----> All ranks receive: [elem_count[0], elem_count[1], elem_count[2], ...] + // Rank 1: elem_count[1] / + // Rank 2: elem_count[2] / - // Create mesh, gauss points, and node data structures on each rank - // This is the initial partitioned mesh - Mesh_t naive_mesh; - node_t naive_node; + int nodes_per_elem = input_mesh.num_nodes_in_elem; - // Mesh partitioned by pt-scotch, not including ghost - Mesh_t intermediate_mesh; - node_t intermediate_node; + // MPI_Allgather: Each rank sends its element count, every rank receives + // the count from every other rank. Result: elem_counts[r] = number of + // elements owned by rank r. 
+ std::vector elem_counts(world_size); + MPI_Allgather(&input_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); // Synchronize all ranks before proceeding + // Compute displacements: offset into the global array for each rank's data + // Example: if elem_counts = [100, 150, 120], then + // elem_displs = [0, 100, 250] (where each rank's data starts in all_elem_gids) + std::vector elem_displs(world_size); + int total_elems = 0; + for (int r = 0; r < world_size; r++) { + elem_displs[r] = total_elems; + total_elems += elem_counts[r]; + } - // Helper arrays to hold element-element connectivity for naive partitioning that include what would be ghost, without having to build the full mesh - std::vector elems_in_elem_on_rank; - std::vector num_elems_in_elem_per_rank; + // MPI_Allgatherv: Gather variable-sized data from all ranks into one array + // Each rank contributes its local_to_global_elem_mapping, which maps + // local element indices to global element GIDs. After this call, + // all_elem_gids contains ALL element GIDs from all ranks, organized by rank. + std::vector all_elem_gids(total_elems); + MPI_Allgatherv(input_mesh.local_to_global_elem_mapping.host_pointer(), input_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, + all_elem_gids.data(), elem_counts.data(), elem_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + // Build a lookup map: element GID -> owning rank + // This allows O(log n) lookups to determine which rank owns any given element. 
+ std::map elem_gid_to_rank; + for (int rank_id = 0; rank_id < world_size; rank_id++) { + for (int i = 0; i < elem_counts[rank_id]; i++) { + size_t gid = all_elem_gids[elem_displs[rank_id] + i]; + elem_gid_to_rank[gid] = rank_id; + } + } - // Perform the naive partitioning of the mesh - naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); + // ======================================================================== + // STEP 2: Build element-to-node connectivity for local elements + // ======================================================================== + // Ghost elements are elements from other ranks that share nodes with our + // locally-owned elements. To identify them, we need to exchange element-node + // connectivity information with all other ranks. + // Collect all nodes that belong to our locally-owned elements + // This set will be used later to check if a remote element is relevant + std::set local_elem_nodes; + for(int node_rid = 0; node_rid < input_mesh.num_nodes; node_rid++) { + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_rid); + local_elem_nodes.insert(node_gid); + } - /********************************************************************************** - * Build PT-Scotch distributed graph representation of the mesh for repartitioning * - ********************************************************************************** - * - * This section constructs the distributed graph (SCOTCH_Dgraph) needed by PT-Scotch - * for mesh repartitioning. In this graph, each mesh element is a vertex, and edges - * correspond to mesh-neighbor relationships (i.e., elements that share a face or are - * otherwise neighbors per your mesh definition). - * - * We use the compact CSR (Compressed Sparse Row) representation, passing only the - * essential information required by PT-Scotch. 
- * - * Variables and structures used: - * - SCOTCH_Dgraph dgraph: - * The distributed graph instance managed by PT-Scotch. Each MPI rank creates - * and fills in its portion of the global graph. - * - * - const SCOTCH_Num baseval: - * The base value for vertex and edge numbering. Set to 0 for C-style zero-based - * arrays. Always use 0 unless you are using Fortran style 1-based arrays. - * - * - const SCOTCH_Num vertlocnbr: - * The *number of local vertices* (mesh elements) defined on this MPI rank. - * In our mesh, this is mesh.num_elems. PT-Scotch expects each rank to specify - * its own local vertex count. - * - * - const SCOTCH_Num vertlocmax: - * The *maximum number of local vertices* that could be stored (capacity). We - * allocate with no unused holes, so vertlocmax = vertlocnbr. - * - * - std::vector vertloctab: - * CSR array [size vertlocnbr+1]: for each local vertex i, vertloctab[i] - * gives the index in edgeloctab where the neighbor list of vertex i begins. - * PT-Scotch expects this array to be of size vertlocnbr+1, where the difference - * vertloctab[i+1] - vertloctab[i] gives the number of edges for vertex i. - * - * - std::vector edgeloctab: - * CSR array [variable size]: a flattened list of *neighboring element global IDs*, - * in no particular order. For vertex i, its neighbors are located at - * edgeloctab[vertloctab[i]...vertloctab[i+1]-1]. - * In this compact CSR, these are global IDs (GIDs), enabling PT-Scotch to - * recognize edges both within and across ranks. - * - * - std::map elem_gid_to_offset: - * Helper map: For a given element global ID, gives the starting offset in - * the flattened neighbor array (elems_in_elem_on_rank) where this element's - * list of neighbor GIDs begins. This allows efficient neighbor list lookup. - * - * - (other arrays used, from mesh setup and communication phase) - * - elements_on_rank: vector of global element IDs owned by this rank. - * - num_elements_on_rank: number of owned elements. 
- * - num_elems_in_elem_per_rank: array, for each owned element, how many - * neighbors it has. - * - elems_in_elem_on_rank: flattened array of global neighbor IDs for all local elements. - * - **********************************************************************************/ + // ======================================================================== + // STEP 3: Exchange element-to-node connectivity via MPI_Allgatherv + // ======================================================================== + // Build a flattened connectivity array: pairs of (elem_gid, node_gid) + // Example for 2 elements with 8 nodes each: + // elem_node_conn = [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] + // + // This format is chosen because it's easy to serialize and deserialize over MPI, + // and allows us to reconstruct the full element-node relationships. + std::vector elem_node_conn; + int local_conn_size = 0; - // --- Step 1: Initialize the PT-Scotch distributed graph object on this MPI rank --- - SCOTCH_Dgraph dgraph; - if (SCOTCH_dgraphInit(&dgraph, MPI_COMM_WORLD) != 0) { - std::cerr << "[rank " << rank << "] SCOTCH_dgraphInit failed\n"; - MPI_Abort(MPI_COMM_WORLD, 1); + // For each locally-owned element, record its GID and all its node GIDs + for (int lid = 0; lid < input_mesh.num_elems; lid++) { + size_t elem_gid = input_mesh.local_to_global_elem_mapping.host(lid); + + // Access nodes_in_elem[lid][*] to get all nodes in this element + for (int j = 0; j < input_mesh.num_nodes_in_elem; j++) { + size_t node_lid = input_mesh.nodes_in_elem.host(lid, j); // Local index + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_lid); // Global index + + elem_node_conn.push_back(elem_gid); + elem_node_conn.push_back(node_gid); + } + local_conn_size += nodes_per_elem * 2; // Each element contributes (num_nodes_in_elem * 2) size_ts } - // Set base value for numbering (0 for C-style arrays) - const SCOTCH_Num baseval = 0; - - // vertlocnbr: Number of 
elements (vertices) that are local to this MPI rank - const SCOTCH_Num vertlocnbr = static_cast(naive_mesh.num_elems); - // vertlocmax: Maximum possible local vertices (no holes, so identical to vertlocnbr) - const SCOTCH_Num vertlocmax = vertlocnbr; - // --- Step 2: Build compact CSR arrays for PT-Scotch (vertloctab, edgeloctab) --- - // vertloctab: for each local mesh element [vertex], gives index in edgeloctab where its neighbor list begins - std::vector vertloctab(vertlocnbr + 1); + // ======================================================================== + // Perform MPI communication to gather connectivity from all ranks + // ======================================================================== + // Similar to Step 1, we use MPI_Allgatherv to collect all element-node + // connectivity pairs. This is a two-stage process: + // 1) Gather the size of each rank's connectivity data + // 2) Gather the actual connectivity data with proper offsets - // edgeloctab: flat array of neighbor global IDs for all local elements, built in order - std::vector edgeloctab; - edgeloctab.reserve(vertlocnbr * 6); // heuristic: assume typical mesh degree is ~6, for performance + // Stage 1: Gather connectivity sizes from each rank + // conn_sizes[r] = number of size_t values that rank r will send + std::vector conn_sizes(world_size); + MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); - // Construct a map from element GID to its offset into elems_in_elem_on_rank (the array of neighbor GIDs) - // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. 
- std::map elem_gid_to_offset;
-    size_t current_offset = 0;
-    for (size_t k = 0; k < naive_mesh.num_elems; k++) {
-        int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k);
-        elem_gid_to_offset[elem_gid_on_rank] = current_offset;
-        current_offset += num_elems_in_elem_per_rank[k]; // WARNING< THIS MUST INCLUDE GHOST< WHICH DONT EXISTS ON THE NAIVE MESH
+    // Compute displacements for the second MPI_Allgatherv call
+    // Displacements tell each rank where its data should be placed in the global array
+    std::vector conn_displs(world_size);
+    int total_conn = 0;
+    for (int r = 0; r < world_size; r++) {
+        conn_displs[r] = total_conn;
+        total_conn += conn_sizes[r];
     }
 
-    // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element ---
-    SCOTCH_Num offset = 0; // running count of edges encountered
-    
-    for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) {
+    // Stage 2: Gather all element-node connectivity data
+    // After this call, all_conn contains the flattened connectivity from every rank,
+    // organized by rank. Access data from rank r using indices [conn_displs[r], conn_displs[r] + conn_sizes[r])
+    std::vector all_conn(total_conn);
+    MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG,
+                   all_conn.data(), conn_sizes.data(), conn_displs.data(),
+                   MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD);
+    MPI_Barrier(MPI_COMM_WORLD);
 
-        // Record current edge offset for vertex lid in vertloctab
-        vertloctab[lid] = offset;
+    // ========================================================================
+    // STEP 4: Identify ghost elements
+    // ========================================================================
+    // A ghost element is an element owned by another rank that shares at least
+    // one node with our locally-owned elements. This step identifies all such elements.
- // Obtain this local element's global ID (from mapping) - int elem_gid = naive_mesh.local_to_global_elem_mapping.host(lid); + // Build a set of locally-owned element GIDs for quick lookup + std::set local_elem_gids; + for (int i = 0; i < input_mesh.num_elems; i++) { + local_elem_gids.insert(input_mesh.local_to_global_elem_mapping.host(i)); + } - // Find offset in the flattened neighbor array for this element's neighbor list - size_t elems_in_elem_offset = elem_gid_to_offset[elem_gid]; + // Build a temporary map: node GID -> set of element GIDs (from other ranks) that contain it + // This helps us identify which remote elements are adjacent to our local elements + std::map> node_to_ext_elem; - // For this element, find the count of its neighbors - // This requires finding its index in the elements_on_rank array - size_t idx = 0; - for (size_t k = 0; k < naive_mesh.num_elems; k++) { - int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); - if (elem_gid_on_rank == elem_gid) { - idx = k; - break; + // Iterate through connectivity data from each rank (except ourselves) + for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // Skip our own data - we already know our elements + + // Parse the connectivity data for rank r + // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
+ // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Offset into all_conn for this pair (elem_gid, node_gid) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // Check if this node belongs to one of our locally-owned elements + if (local_elem_nodes.find(node_gid) != local_elem_nodes.end()) { + // Check if this element is NOT owned by us (i.e., it's from another rank) + if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { + // This is a ghost element for us + node_to_ext_elem[node_gid].insert(elem_gid); + } } } - size_t num_nbrs = num_elems_in_elem_per_rank[idx]; + } - // Append each neighbor (by its GLOBAL elem GID) to edgeloctab - for (size_t j = 0; j < num_nbrs; j++) { - size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; // This is a global element ID! - edgeloctab.push_back(static_cast(neighbor_gid)); - ++offset; // Increment running edge count + // Extract all unique ghost element GIDs + // We use a set to eliminate duplicates (same ghost element might share multiple nodes with us) + std::set ghost_elem_gids; + for (const auto& pair : node_to_ext_elem) { + for (size_t elem_gid : pair.second) { + ghost_elem_gids.insert(elem_gid); } } - // vertloctab[vertlocnbr] stores total number of edges written, finalizes the CSR structure - vertloctab[vertlocnbr] = offset; + // Additional check: elements that are neighbors of our locally-owned elements + // but are owned by other ranks (these might already be in ghost_elem_gids, but check connectivity) - // edgelocnbr/edgelocsiz: Number of edge endpoints defined locally - // (PT-Scotch's distributed graphs allow edges to be replicated or owned by either endpoint) - const SCOTCH_Num edgelocnbr = offset; // total number of edge endpoints (sum of all local neighbor degrees) - const SCOTCH_Num edgelocsiz = edgelocnbr; // allocated size 
matches number of endpoints + // for (int lid = 0; lid < num_new_elems; lid++) { + // size_t num_neighbors = input_mesh.num_elems_in_elem(lid); + + // for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { + // size_t neighbor_lid = input_mesh.elems_in_elem(lid, nbr_idx); + + // if (neighbor_lid < static_cast(num_new_elems)) { + // size_t neighbor_gid = input_mesh.local_to_global_elem_mapping(neighbor_lid); + + // // Check if neighbor is owned by this rank + // auto it = elem_gid_to_rank.find(neighbor_gid); + // if (it != elem_gid_to_rank.end() && it->second != rank) { + // // Neighbor is owned by another rank - it's a ghost for us + // std::cout << "[rank " << rank << "] found ghost element " << neighbor_gid << std::endl; + // ghost_elem_gids.insert(neighbor_gid); + // } + // } + // } + // } + + // Store the count of ghost elements for later use + input_mesh.num_ghost_elems = ghost_elem_gids.size(); - // Optionally print graph structure for debugging/validation - if (print_info) { - std::cout << "Rank " << rank << ": vertlocnbr = # of local elements(vertices) = " << vertlocnbr - << ", edgelocnbr = # of local edge endpoints = " << edgelocnbr << std::endl; - std::cout << "vertloctab (CSR row offsets): "; - for (size_t i = 0; i <= vertlocnbr; i++) { - std::cout << vertloctab[i] << " "; - } - std::cout << std::endl; - std::cout << "edgeloctab (first 20 neighbor GIDs): "; - for (size_t i = 0; i < std::min((size_t)20, edgeloctab.size()); i++) { - std::cout << edgeloctab[i] << " "; - } - std::cout << std::endl; - } MPI_Barrier(MPI_COMM_WORLD); - /************************************************************************** - * Step 4: Build the distributed graph using PT-Scotch's SCOTCH_dgraphBuild - * - * - PT-Scotch will use our CSR arrays. Since we use compact representation, - * most optional arrays ("veloloctab", "vlblloctab", "edgegsttab", "edloloctab") - * can be passed as nullptr. - * - edgeloctab contains *GLOBAL element GIDs* of neighbors. 
PT-Scotch uses this - * to discover connections across processor boundaries, so you do not have to - * encode ownership or partition information yourself. - **************************************************************************/ - int rc = SCOTCH_dgraphBuild( - &dgraph, - baseval, // start index (0) - vertlocnbr, // local vertex count (local elements) - vertlocmax, // local vertex max (no holes) - vertloctab.data(), // row offsets in edgeloctab - /*vendloctab*/ nullptr, // end of row offsets (compact CSR => nullptr) - /*veloloctab*/ nullptr, // vertex weights, not used - /*vlblloctab*/ nullptr, // vertex global labels (we use GIDs in edgeloctab) - edgelocnbr, // local edge endpoints count - edgelocsiz, // size of edge array - edgeloctab.data(), // global neighbor IDs for each local node - /*edgegsttab*/ nullptr, // ghost edge array, not used - /*edloloctab*/ nullptr // edge weights, not used - ); - if (rc != 0) { - std::cerr << "[rank " << rank << "] SCOTCH_dgraphBuild failed rc=" << rc << "\n"; - SCOTCH_dgraphFree(&dgraph); - MPI_Abort(MPI_COMM_WORLD, rc); + + // ======================================================================== + // STEP 5: Extract ghost element connectivity + // ======================================================================== + // Now that we know which elements are ghosts, we need to extract their + // full node connectivity from all_conn. This allows us to properly construct + // the extended mesh with ghost elements included. 
+ + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Starting to build extended mesh with ghost elements" << std::endl; + + // Build a map: ghost_elem_gid -> vector of node_gids + // We pre-allocate the vector size to avoid repeated reallocations + std::map> ghost_elem_to_nodes; + for (const size_t& ghost_gid : ghost_elem_gids) { + ghost_elem_to_nodes[ghost_gid].reserve(input_mesh.num_nodes_in_elem); } - // Optionally, print rank summary after graph build for further validation - if (print_info) { - SCOTCH_Num vertlocnbr_out; - SCOTCH_dgraphSize(&dgraph, &vertlocnbr_out, nullptr, nullptr, nullptr); - std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr = " << vertlocnbr_out << std::endl; + // ======================================================================== + // Extract nodes for each ghost element from the globally-collected all_conn + // ======================================================================== + // The all_conn array was populated by MPI_Allgatherv and contains connectivity + // pairs (elem_gid, node_gid) for all elements from all ranks. We now parse + // this data to extract the nodes for each ghost element. 
+ for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // Skip our own data - we already have owned element connectivity + + // Parse connectivity data for rank r + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Calculate offset for this pair: displacement + (pair_index * 2) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this element is one of our identified ghost elements, record its node + auto it = ghost_elem_to_nodes.find(elem_gid); + if (it != ghost_elem_to_nodes.end()) { + it->second.push_back(node_gid); + } + } } - MPI_Barrier(MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished building the distributed graph using PT-Scotch"<(input_mesh.num_nodes_in_elem)) { + std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first + << " has " << pair.second.size() << " nodes, expected " << input_mesh.num_nodes_in_elem << std::endl; + } + } - /******************************************************** - * Step 5: Validate the graph using SCOTCH_dgraphCheck - ********************************************************/ - rc = SCOTCH_dgraphCheck(&dgraph); - if (rc != 0) { - std::cerr << "[rank " << rank << "] SCOTCH_dgraphCheck failed rc=" << rc << "\n"; - SCOTCH_dgraphFree(&dgraph); - MPI_Abort(MPI_COMM_WORLD, rc); + // Step 2: Build extended node list (owned nodes first, then ghost-only nodes) + // Start with owned nodes + std::map node_gid_to_extended_lid; + int extended_node_lid = 0; + + // Add all owned nodes + for (int i = 0; i < input_mesh.num_nodes; i++) { + size_t node_gid = input_mesh.local_to_global_node_mapping.host(i); + node_gid_to_extended_lid[node_gid] = extended_node_lid++; } - /************************************************************** - * Step 6: Partition (repartition) the mesh using PT-Scotch - * - Each vertex (mesh element) will be assigned a part (mesh chunk). 
- * - Arch is initialized for a complete graph of world_size parts (one per rank). - **************************************************************/ - // SCOTCH_Arch controls the "architecture" for partitioning: the topology - // (number and connectivity of parts) to which the graph will be mapped. - // The archdat variable encodes this. Below are common options: - // - // - SCOTCH_archCmplt(&archdat, nbparts) - // * Creates a "complete graph" architecture with nbparts nodes (fully connected). - // Every part is equally distant from every other part. - // This is typically used when minimizing only *balance* and *edge cut*, - // not considering any underlying machine topology. - // - // - SCOTCH_archHcub(&archdat, dimension) - // * Hypercube architecture (rare in modern use). - // Sets up a hypercube of given dimension. - // - // - SCOTCH_archTleaf / SCOTCH_archTleafX - // * Tree architectures, for hierarchically structured architectures. - // - // - SCOTCH_archMesh2 / SCOTCH_archMesh3 - // * 2D or 3D mesh topology architectures (useful for grid/matrix machines). - // - // - SCOTCH_archBuild - // * General: builds any architecture from a descriptor string. - // - // For distributed mesh partitioning to MPI ranks (where all ranks are equal), - // the most common and appropriate is "complete graph" (Cmplt): each part (rank) - // is equally reachable from any other (no communication topology bias). - SCOTCH_Arch archdat; // PT-Scotch architecture structure: describes desired partition topology - SCOTCH_archInit(&archdat); - // Partition into 'world_size' equally connected parts (each MPI rank is a "node") - // Other topology options could be substituted above according to your needs (see docs). 
- SCOTCH_archCmplt(&archdat, static_cast(world_size)); + // Add ghost-only nodes (nodes that belong to ghost elements but not to owned elements) + std::set ghost_only_nodes; + for (const auto& pair : ghost_elem_to_nodes) { + for (size_t node_gid : pair.second) { + // Check if we already have this node + if (node_gid_to_extended_lid.find(node_gid) == node_gid_to_extended_lid.end()) { + ghost_only_nodes.insert(node_gid); + } + } + } + // Assign extended local IDs to ghost-only nodes + for (size_t node_gid : ghost_only_nodes) { + node_gid_to_extended_lid[node_gid] = extended_node_lid++; + } + int total_extended_nodes = extended_node_lid; - - // ===================== PT-Scotch Strategy Selection and Documentation ====================== - // The PT-Scotch "strategy" (stratdat here) controls the algorithms and heuristics used for partitioning. - // You can specify a string or build a strategy using functions that adjust speed, quality, and recursion. - // - // Common strategy flags (see "scotch.h", "ptscotch.h", and PT-Scotch documentation): - // - // - SCOTCH_STRATDEFAULT: Use the default (fast, reasonable quality) partitioning strategy. - // Useful for quick, generic partitions where quality is not critical. - // - // - SCOTCH_STRATSPEED: Aggressively maximizes speed (at the cost of cut quality). - // For large runs or test runs where speed is more important than minimizing edgecut. - // - // - SCOTCH_STRATQUALITY: Prioritizes partition *quality* (minimizing edge cuts, maximizing load balance). - // Slower than the default. Use when high-quality partitioning is desired. - // - // - SCOTCH_STRATBALANCE: Tradeoff between speed and quality for balanced workload across partitions. - // Use if load balance is more critical than cut size. - // - // Additional Options: - // - Strategy can also be specified as a string (see Scotch manual, e.g., "b{sep=m{...} ...}"). - // - Recursion count parameter (here, set to 0) controls strategy recursion depth (0 = automatic). 
- // - Imbalance ratio (here, 0.01) allows minor imbalance in part weight for better cut quality. - // - // Example usage: - // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATQUALITY, nparts, 0, 0.01); - // ^ quality-focused, nparts=number of parts/ranks - // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATSPEED, nparts, 0, 0.05); - // ^ speed-focused, allow 5% imbalance - // - // Reference: - // - https://gitlab.inria.fr/scotch/scotch/-/blob/master/doc/libptscotch.pdf - // - SCOTCH_stratDgraphMapBuild() and related "strategy" documentation. - // - // --------------- Set up the desired partitioning strategy here: --------------- - SCOTCH_Strat stratdat; // PT-Scotch strategy object: holds partitioning options/settings - SCOTCH_stratInit(&stratdat); + // Step 3: Prepare requests for ghost node coordinates from owning ranks (if needed later) + // Build request list: for each ghost node, find an owning rank via any ghost element that contains it + std::map> rank_to_ghost_node_requests; + for (size_t node_gid : ghost_only_nodes) { + // Find which rank owns an element containing this node + // Look through ghost elements + for (const auto& pair : ghost_elem_to_nodes) { + size_t ghost_elem_gid = pair.first; + const std::vector& nodes = pair.second; + bool found = false; + for (size_t ngid : nodes) { + if (ngid == node_gid) { + found = true; + break; + } + } + if (found) { + auto owner_it = elem_gid_to_rank.find(ghost_elem_gid); + if (owner_it != elem_gid_to_rank.end()) { + rank_to_ghost_node_requests[owner_it->second].push_back(node_gid); + break; + } + } + } + } - // Select partitioning strategy for this run: - // Use SCOTCH_STRATQUALITY for best cut quality. - // To change: replace with SCOTCH_STRATDEFAULT, SCOTCH_STRATSPEED, or SCOTCH_STRATBALANCE as discussed above. 
- // Arguments: (strategy object, strategy flag, #parts, recursion (0=auto), imbalance ratio) - SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.001); + // Step 4: Build extended element list and node connectivity + // Owned elements: 0 to num_new_elems-1 (already have these) + // Ghost elements: num_new_elems to num_new_elems + num_ghost_elems - 1 - // partloctab: output array mapping each local element (vertex) to a *target partition number* - // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. - std::vector partloctab(vertlocnbr); - rc = SCOTCH_dgraphMap(&dgraph, &archdat, &stratdat, partloctab.data()); - if (rc != 0) { - std::cerr << "[rank " << rank << "] SCOTCH_dgraphMap failed rc=" << rc << "\n"; - SCOTCH_stratExit(&stratdat); - SCOTCH_archExit(&archdat); - SCOTCH_dgraphFree(&dgraph); - MPI_Abort(MPI_COMM_WORLD, rc); + // Create extended element-node connectivity array + int total_extended_elems = input_mesh.num_elems + input_mesh.num_ghost_elems; + std::vector> extended_nodes_in_elem(total_extended_elems); + + // Copy owned element connectivity (convert to extended node LIDs) + for (int lid = 0; lid < input_mesh.num_elems; lid++) { + extended_nodes_in_elem[lid].reserve(nodes_per_elem); + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = input_mesh.nodes_in_elem.host(lid, j); + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_lid); + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[lid].push_back(ext_lid); + } } - // Clean up PT-Scotch strategy and architecture objects - SCOTCH_stratExit(&stratdat); - SCOTCH_archExit(&archdat); - - // Free the graph now that we have the partition assignments - SCOTCH_dgraphFree(&dgraph); + // Add ghost element connectivity (map ghost node GIDs to extended node LIDs) + int ghost_elem_ext_lid = input_mesh.num_elems; + std::vector ghost_elem_gids_ordered(ghost_elem_gids.begin(), 
ghost_elem_gids.end()); + std::sort(ghost_elem_gids_ordered.begin(), ghost_elem_gids_ordered.end()); - /*************************************************************************** - * Step 7 (Optional): Print out the partitioning assignment per element - * - Each local element's local index lid and global ID (gid) are listed with the - * part to which PT-Scotch has assigned them. - ***************************************************************************/ - print_info = false; - for(int rank_id = 0; rank_id < world_size; rank_id++) { - if(rank_id == rank && print_info) { - for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) { - size_t gid = naive_mesh.local_to_global_elem_mapping.host(lid); - std::cout << "[rank " << rank_id << "] elem_local=" << lid << " gid=" << gid - << " -> part=" << partloctab[lid] << "\n"; + for (size_t ghost_gid : ghost_elem_gids_ordered) { + auto it = ghost_elem_to_nodes.find(ghost_gid); + if (it == ghost_elem_to_nodes.end()) continue; + + extended_nodes_in_elem[ghost_elem_ext_lid].reserve(nodes_per_elem); + for (size_t node_gid : it->second) { + int ext_lid = node_gid_to_extended_lid[node_gid]; + extended_nodes_in_elem[ghost_elem_ext_lid].push_back(ext_lid); + } + ghost_elem_ext_lid++; + } + + MPI_Barrier(MPI_COMM_WORLD); + // Sequential rank-wise printing of extended mesh structure info + if(print_info) { + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; + std::cout << "[rank " << rank << "] - Owned elements: " << input_mesh.num_elems << std::endl; + std::cout << "[rank " << rank << "] - Ghost elements: " << ghost_elem_gids.size() << std::endl; + std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << std::endl; + std::cout << "[rank " << rank << "] - Owned nodes: " << input_mesh.num_nodes << std::endl; + std::cout << "[rank " << rank << "] - Ghost-only nodes: " 
<< ghost_only_nodes.size() << std::endl; + std::cout << "[rank " << rank << "] - Total extended nodes: " << total_extended_nodes << std::endl; + std::cout << std::flush; } MPI_Barrier(MPI_COMM_WORLD); } - MPI_Barrier(MPI_COMM_WORLD); } - print_info = false; + // The extended_nodes_in_elem vector now contains the connectivity for both owned and ghost elements + // Each element's nodes are stored using extended local node IDs (0-based, contiguous) -// ****************************************************************************************** -// Build the final mesh from the repartition -// ****************************************************************************************** + // Build reverse maps: extended_lid -> gid for nodes and elements + std::vector extended_lid_to_node_gid(total_extended_nodes); + for (const auto& pair : node_gid_to_extended_lid) { + extended_lid_to_node_gid[pair.second] = pair.first; + } + // Build extended element GID list: owned first, then ghost + std::vector extended_lid_to_elem_gid(total_extended_elems); + // Owned elements + for (int i = 0; i < input_mesh.num_elems; i++) { + extended_lid_to_elem_gid[i] = input_mesh.local_to_global_elem_mapping.host(i); + } - MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0) std::cout << "\n=== Starting Mesh Redistribution Phase ===\n"; - MPI_Barrier(MPI_COMM_WORLD); + // Ghost elements (in sorted order) + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { + extended_lid_to_elem_gid[input_mesh.num_elems + i] = ghost_elem_gids_ordered[i]; + } - // -------------- Phase 1: Determine elements to send to each rank -------------- - std::vector> elems_to_send(world_size); - for (int lid = 0; lid < naive_mesh.num_elems; lid++) { - int dest = static_cast(partloctab[lid]); - int elem_gid = static_cast(naive_mesh.local_to_global_elem_mapping.host(lid)); - elems_to_send[dest].push_back(elem_gid); + // Build array: for each ghost element, store which rank owns it (where to receive data from) + std::vector 
ghost_elem_owner_ranks(ghost_elem_gids_ordered.size()); + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { + size_t ghost_gid = ghost_elem_gids_ordered[i]; + auto it = elem_gid_to_rank.find(ghost_gid); + if (it != elem_gid_to_rank.end()) { + ghost_elem_owner_ranks[i] = it->second; + } else { + std::cerr << "[rank " << rank << "] ERROR: Ghost element GID " << ghost_gid + << " not found in elem_gid_to_rank map!" << std::endl; + ghost_elem_owner_ranks[i] = -1; // Invalid rank as error indicator + } } - // -------------- Phase 2: Exchange element GIDs -------------- - std::vector sendcounts(world_size), recvcounts(world_size); - for (int r = 0; r < world_size; r++) - sendcounts[r] = static_cast(elems_to_send[r].size()); + // Create a std::set of all the ranks this rank will receive data from + std::set ghost_elem_receive_ranks; + for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { + ghost_elem_receive_ranks.insert(ghost_elem_owner_ranks[i]); + } - MPI_Alltoall(sendcounts.data(), 1, MPI_INT, recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); + // ****************************************************************************************** + // Build the final partitioned mesh + // ****************************************************************************************** - MPI_Barrier(MPI_COMM_WORLD); - // Compute displacements - std::vector sdispls(world_size), rdispls(world_size); - int send_total = 0, recv_total = 0; - for (int r = 0; r < world_size; r++) { - sdispls[r] = send_total; - rdispls[r] = recv_total; - send_total += sendcounts[r]; - recv_total += recvcounts[r]; + output_mesh.initialize_nodes(total_extended_nodes); + output_mesh.initialize_elems(total_extended_elems, 3); + output_mesh.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); + output_mesh.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); + for (int i = 0; i < total_extended_nodes; i++) { + output_mesh.local_to_global_node_mapping.host(i) = 
extended_lid_to_node_gid[i]; + } + for (int i = 0; i < total_extended_elems; i++) { + output_mesh.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; } + output_mesh.local_to_global_node_mapping.update_device(); + output_mesh.local_to_global_elem_mapping.update_device(); + output_mesh.num_ghost_elems = ghost_elem_gids.size(); + output_mesh.num_ghost_nodes = ghost_only_nodes.size(); - // Flatten send buffer - // send_elems: flattened list of element global IDs (GIDs) that this rank is sending to all other ranks. - // For each rank r, elems_to_send[r] contains the element GIDs that should be owned by rank r after repartitioning. - std::vector send_elems; - send_elems.reserve(send_total); - for (int r = 0; r < world_size; r++) - send_elems.insert(send_elems.end(), elems_to_send[r].begin(), elems_to_send[r].end()); - // new_elem_gids: receives the list of new element global IDs this rank will own after the exchange. - // It is filled after MPI_Alltoallv completes, and contains the GIDs for the elements new to (or remained on) this rank. 
- std::vector new_elem_gids(recv_total); - MPI_Alltoallv(send_elems.data(), sendcounts.data(), sdispls.data(), MPI_INT, - new_elem_gids.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); - + output_mesh.num_owned_elems = input_mesh.num_elems; + output_mesh.num_owned_nodes = input_mesh.num_nodes; + MPI_Barrier(MPI_COMM_WORLD); - - // New elements owned by this rank - int num_new_elems = static_cast(new_elem_gids.size()); - - if (print_info) { - std::cout << "[rank " << rank << "] new elems: " << num_new_elems << std::endl; + // rebuild the local element-node connectivity using the local node ids + // extended_nodes_in_elem already contains extended local node IDs, so we can use them directly + for(int i = 0; i < total_extended_elems; i++) { + for(int j = 0; j < nodes_per_elem; j++) { + output_mesh.nodes_in_elem.host(i, j) = extended_nodes_in_elem[i][j]; + } } - // -------------- Phase 3: Send element–node connectivity -------------- - int nodes_per_elem = naive_mesh.num_nodes_in_elem; + MPI_Barrier(MPI_COMM_WORLD); - // Flatten element-node connectivity by global node IDs - std::vector conn_sendbuf; - for (int r = 0; r < world_size; r++) { - for (int elem_gid : elems_to_send[r]) { - // find local element lid from elem_gid - int lid = -1; - for (int i = 0; i < naive_mesh.num_elems; i++) - if (naive_mesh.local_to_global_elem_mapping.host(i) == elem_gid) { lid = i; break; } + double t_ghost_end = MPI_Wtime(); - for (int j = 0; j < nodes_per_elem; j++) { - int node_lid = naive_mesh.nodes_in_elem.host(lid, j); - int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); - conn_sendbuf.push_back(node_gid); - } - } + if (rank == 0) { + std::cout << " Finished calculating ghost elements" << std::endl; + std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." 
<< std::endl; } - // element-node connectivity counts (ints per dest rank) - std::vector conn_sendcounts(world_size), conn_recvcounts(world_size); - for (int r = 0; r < world_size; r++) - conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; - - MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); - + output_mesh.nodes_in_elem.update_device(); + output_mesh.build_connectivity(); MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging element–node connectivity counts"< conn_sdispls(world_size), conn_rdispls(world_size); - int conn_send_total = 0, conn_recv_total = 0; - for (int r = 0; r < world_size; r++) { - conn_sdispls[r] = conn_send_total; - conn_rdispls[r] = conn_recv_total; - conn_send_total += conn_sendcounts[r]; - conn_recv_total += conn_recvcounts[r]; - } + if(rank == 0) std::cout << " Finished building final mesh structure with ghost nodes and elements" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); - std::vector conn_recvbuf(conn_recv_total); - MPI_Alltoallv(conn_sendbuf.data(), conn_sendcounts.data(), conn_sdispls.data(), MPI_INT, - conn_recvbuf.data(), conn_recvcounts.data(), conn_rdispls.data(), MPI_INT, MPI_COMM_WORLD); + // ****************************************************************************************** + // Build the final nodes that include ghost + // ****************************************************************************************** - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging element–node connectivity"< node_gid_set(conn_recvbuf.begin(), conn_recvbuf.end()); - std::vector new_node_gids(node_gid_set.begin(), node_gid_set.end()); - int num_new_nodes = static_cast(new_node_gids.size()); + output_node.initialize(total_extended_nodes, 3, {node_state::coords}); - // Build map gid→lid - std::unordered_map node_gid_to_lid; - for (int i = 0; i < num_new_nodes; i++) - node_gid_to_lid[new_node_gids[i]] = i; + // The goal here is to 
populate output_node.coords using globally gathered ghost node coordinates, + // since input_node does not contain ghost node coordinates. + // + // Each rank will: + // 1. Gather coordinates of its owned nodes (from input_node). + // 2. Use MPI to gather all coordinates for all required (owned + ghost) global node IDs + // into a structure mapping global ID -> coordinate. + // 3. Use this map to fill output_node.coords. - if (print_info) - std::cout << "[rank " << rank << "] owns " << num_new_nodes << " unique nodes\n"; + // 1. Build list of all global node IDs needed on this rank (owned + ghosts) + std::vector all_needed_node_gids(total_extended_nodes); + for (int i = 0; i < total_extended_nodes; i++) { + all_needed_node_gids[i] = output_mesh.local_to_global_node_mapping.host(i); + } + // 2. Build owned node GIDs and their coordinates + std::vector owned_gids(output_mesh.num_owned_nodes); + for (int i = 0; i < output_mesh.num_owned_nodes; i++) + owned_gids[i] = output_mesh.local_to_global_node_mapping.host(i); - // -------------- Phase 5: Request node coordinates -------------- - std::vector node_coords_sendbuf; - for (int r = 0; r < world_size; r++) { - for (int gid : elems_to_send[r]) { - int lid = -1; - for (int i = 0; i < naive_mesh.num_elems; i++) - if (naive_mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + // 3. Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) + // so we can distribute the needed coordinate data. 
+ // The easiest is to Allgather everyone's "owned_gids" and coords - for (int j = 0; j < nodes_per_elem; j++) { - int node_lid = naive_mesh.nodes_in_elem.host(lid, j); - int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); + int local_owned_count = static_cast(owned_gids.size()); + std::vector owned_counts(world_size, 0); + if (local_owned_count < 0) local_owned_count = 0; // Clean up possibility of -1 - node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 0)); - node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 1)); - node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 2)); - } - } + // a) Gather counts + owned_counts.resize(world_size, 0); + MPI_Allgather(&local_owned_count, 1, MPI_INT, owned_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + + // b) Displacements and total + std::vector owned_displs(world_size,0); + int total_owned = 0; + for (int r = 0; r < world_size; r++) { + owned_displs[r] = total_owned; + total_owned += owned_counts[r]; } - // Each node is 3 doubles; same sendcounts scaling applies - std::vector coord_sendcounts(world_size), coord_recvcounts(world_size); - for (int r = 0; r < world_size; r++) - coord_sendcounts[r] = sendcounts[r] * nodes_per_elem * 3; + // c) Global GIDs (size: total_owned) + std::vector all_owned_gids(total_owned); + MPI_Allgatherv(owned_gids.data(), local_owned_count, MPI_UNSIGNED_LONG_LONG, + all_owned_gids.data(), owned_counts.data(), owned_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging node coordinates counts"< coord_sdispls(world_size), coord_rdispls(world_size); - int coord_send_total = 0, coord_recv_total = 0; + // d) Global coords (size: total_owned x 3) + std::vector owned_coords_send(3*local_owned_count, 0.0); + for (int i = 0; i < local_owned_count; i++) { + 
owned_coords_send[3*i+0] = input_node.coords.host(i,0); + owned_coords_send[3*i+1] = input_node.coords.host(i,1); + owned_coords_send[3*i+2] = input_node.coords.host(i,2); + } + std::vector all_owned_coords(3 * total_owned, 0.0); + + // Create coordinate-specific counts and displacements (in units of doubles, not nodes) + std::vector coord_counts(world_size); + std::vector coord_displs(world_size); for (int r = 0; r < world_size; r++) { - coord_sdispls[r] = coord_send_total; - coord_rdispls[r] = coord_recv_total; - coord_send_total += coord_sendcounts[r]; - coord_recv_total += coord_recvcounts[r]; + coord_counts[r] = 3 * owned_counts[r]; // Each node has 3 doubles + coord_displs[r] = 3 * owned_displs[r]; // Displacement in doubles } - std::vector coord_recvbuf(coord_recv_total); - MPI_Alltoallv(node_coords_sendbuf.data(), coord_sendcounts.data(), coord_sdispls.data(), MPI_DOUBLE, - coord_recvbuf.data(), coord_recvcounts.data(), coord_rdispls.data(), MPI_DOUBLE, MPI_COMM_WORLD); + MPI_Allgatherv(owned_coords_send.data(), 3*local_owned_count, MPI_DOUBLE, + all_owned_coords.data(), coord_counts.data(), coord_displs.data(), + MPI_DOUBLE, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished exchanging node coordinates"< coord[3] + std::unordered_map> gid_to_coord; + for (int i = 0; i < total_owned; i++) { + std::array xyz = { + all_owned_coords[3*i+0], + all_owned_coords[3*i+1], + all_owned_coords[3*i+2] + }; + gid_to_coord[all_owned_gids[i]] = xyz; + } - // -------------- Phase 6: Build the intermediate_mesh -------------- - intermediate_mesh.initialize_nodes(num_new_nodes); - intermediate_mesh.initialize_elems(num_new_elems, naive_mesh.num_dims); - intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes); - intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); + // 4. Finally, fill output_node.coords with correct coordinates. 
+ for (int i = 0; i < total_extended_nodes; i++) { + size_t gid = output_mesh.local_to_global_node_mapping.host(i); + auto it = gid_to_coord.find(gid); + if (it != gid_to_coord.end()) { + output_node.coords.host(i,0) = it->second[0]; + output_node.coords.host(i,1) = it->second[1]; + output_node.coords.host(i,2) = it->second[2]; + } else { + // Could happen if there's a bug: fill with zeros for safety + output_node.coords.host(i,0) = 0.0; + output_node.coords.host(i,1) = 0.0; + output_node.coords.host(i,2) = 0.0; + } + } + output_node.coords.update_device(); - // Fill global mappings - for (int i = 0; i < num_new_nodes; i++) - intermediate_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; - for (int i = 0; i < num_new_elems; i++) - intermediate_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; - intermediate_mesh.local_to_global_node_mapping.update_device(); - intermediate_mesh.local_to_global_elem_mapping.update_device(); + // -------------------------------------------------------------------------------------- + // Build the send patterns for elements + // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost element GIDs. + // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. + // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
+ // -------------------------------------------------------------------------------------- + std::vector>> boundary_elem_targets(output_mesh.num_owned_elems); + // Prepare local ghost list as vector + std::vector ghost_gids_vec; + ghost_gids_vec.reserve(output_mesh.num_ghost_elems); + for (int i = 0; i < output_mesh.num_ghost_elems; i++) { + ghost_gids_vec.push_back(output_mesh.local_to_global_elem_mapping.host(output_mesh.num_owned_elems + i)); // Ghost elements are after the owned elements in the global element mapping + } - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< ghost_counts(world_size, 0); + int local_ghost_count = output_mesh.num_ghost_elems; + MPI_Allgather(&local_ghost_count, 1, MPI_INT, ghost_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - int node_gid = conn_recvbuf[i * intermediate_mesh.num_nodes_in_elem + j]; + // Displacements and recv buffer + std::vector ghost_displs(world_size, 0); + int total_ghosts = 0; + for (int r = 0; r < world_size; r++) { + ghost_displs[r] = total_ghosts; + total_ghosts += ghost_counts[r]; + } + std::vector all_ghost_gids(total_ghosts); - int node_lid = -1; + // Gather ghost gids + MPI_Allgatherv(ghost_gids_vec.data(), local_ghost_count, MPI_UNSIGNED_LONG_LONG, + all_ghost_gids.data(), ghost_counts.data(), ghost_displs.data(), + MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - // Binary search through local_to_global_node_mapping to find the equivalent local index - int left = 0, right = num_new_nodes - 1; - while (left <= right) { - int mid = left + (right - left) / 2; - size_t mid_gid = intermediate_mesh.local_to_global_node_mapping.host(mid); - if (node_gid == mid_gid) { - node_lid = mid; - break; - } else if (node_gid < mid_gid) { - right = mid - 1; - } else { - left = mid + 1; - } - } - intermediate_mesh.nodes_in_elem.host(i, j) = node_lid; + + // Build map gid -> ranks that ghost it + std::unordered_map> 
gid_to_ghosting_ranks; + gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); + for (int r = 0; r < world_size; r++) { + int cnt = ghost_counts[r]; + int off = ghost_displs[r]; + for (int i = 0; i < cnt; i++) { + size_t g = all_ghost_gids[off + i]; + gid_to_ghosting_ranks[g].push_back(r); + } + } + + // For each local element, list destinations: ranks that ghost our gid + for (int elem_lid = 0; elem_lid < output_mesh.num_owned_elems; elem_lid++) { + size_t local_elem_gid = output_mesh.local_to_global_elem_mapping.host(elem_lid); + auto it = gid_to_ghosting_ranks.find(local_elem_gid); + if (it == gid_to_ghosting_ranks.end()) continue; + const std::vector &dest_ranks = it->second; + for (int rr : dest_ranks) { + if (rr == rank) continue; + boundary_elem_targets[elem_lid].push_back(std::make_pair(rr, local_elem_gid)); } } MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Finished reverse mapping of the element-node connectivity from the global node ids to the local node ids"< boundary_elem_local_ids; + std::vector> boundary_to_ghost_ranks; // ragged array dimensions (num_boundary_elems, num_ghost_ranks) - // Fill node coordinates - // coord_recvbuf contains coords in element-node order, but we need them in node order - // Build a map from node GID to coordinates - std::map> node_gid_to_coords; - int coord_idx = 0; - for (int e = 0; e < intermediate_mesh.num_elems; ++e) { - for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { - int node_gid = conn_recvbuf[e * intermediate_mesh.num_nodes_in_elem + j]; - if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { - node_gid_to_coords[node_gid] = { - coord_recvbuf[coord_idx*3 + 0], - coord_recvbuf[coord_idx*3 + 1], - coord_recvbuf[coord_idx*3 + 2] - }; + std::set ghost_comm_ranks; // set of ranks that this rank communicates with + + + for (int elem_lid = 0; elem_lid < output_mesh.num_owned_elems; elem_lid++) { + + int local_elem_gid = 
output_mesh.local_to_global_elem_mapping.host(elem_lid); + if (boundary_elem_targets[elem_lid].empty()) + { + continue; + } + else + { + // Fill in vector of boundary local_ids + boundary_elem_local_ids.push_back(elem_lid); + std::vector ghost_ranks_for_this_boundary_elem; + for (const auto &pr : boundary_elem_targets[elem_lid]) { + ghost_ranks_for_this_boundary_elem.push_back(pr.first); + ghost_comm_ranks.insert(pr.first); } - coord_idx++; + boundary_to_ghost_ranks.push_back(ghost_ranks_for_this_boundary_elem); } } - - // Now fill coordinates in node order - intermediate_node.initialize(num_new_nodes, 3, {node_state::coords}); - for (int i = 0; i < num_new_nodes; i++) { - int node_gid = new_node_gids[i]; - auto it = node_gid_to_coords.find(node_gid); - if (it != node_gid_to_coords.end()) { - intermediate_node.coords.host(i, 0) = it->second[0]; - intermediate_node.coords.host(i, 1) = it->second[1]; - intermediate_node.coords.host(i, 2) = it->second[2]; - } + + int num_ghost_comm_ranks = ghost_comm_ranks.size(); + std::vector ghost_comm_ranks_vec(num_ghost_comm_ranks); + int i = 0; + for (const auto &r : ghost_comm_ranks) { + ghost_comm_ranks_vec[i] = r; + i++; } - intermediate_node.coords.update_device(); - // Connectivity rebuild - intermediate_mesh.build_connectivity(); + MPI_Barrier(MPI_COMM_WORLD); + output_mesh.num_boundary_elems = boundary_elem_local_ids.size(); + output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems); + for (int i = 0; i < output_mesh.num_boundary_elems; i++) { + output_mesh.boundary_elem_local_ids.host(i) = boundary_elem_local_ids[i]; + } + output_mesh.boundary_elem_local_ids.update_device(); + print_info = false; + + + MPI_Barrier(MPI_COMM_WORLD); -// ****************************************************************************************** -// Build the ghost elements and nodes -// ================================================================================================** -// -// OVERVIEW OF GHOST 
ELEMENT IDENTIFICATION: -// ========================================== -// In distributed memory parallel computing with MPI, each processor (rank) owns a subset of mesh -// elements. However, to perform computations that depend on element neighbors or to maintain -// consistency at domain boundaries, we need ghost elements: copies of elements from neighboring -// ranks that share nodes with our locally-owned elements. -// -// This algorithm identifies and extracts ghost element data in 5 steps: -// 1. Gather ownership information: Which rank owns which elements (via MPI_Allgatherv) -// 2. Collect local element-node connectivity for distribution -// 3. Broadcast connectivity to all ranks (via MPI_Allgatherv) -// 4. Identify which remote elements touch our local elements -// 5. Extract the full connectivity data for identified ghost elements -// -// KEY DATA STRUCTURES: -// - elem_gid_to_rank: Map from element global ID to owning rank -// - all_elem_gids: Every element GID from every rank (on every rank) -// - all_conn: Flattened (elem_gid, node_gid) pairs from every rank (on every rank) -// - ghost_elem_gids: Set of remote element GIDs that are ghosts for this rank -// - ghost_elem_to_nodes: Map from ghost element GID to its node GIDs -// -// WHY THIS APPROACH? -// - MPI_Allgatherv is efficient for gathering all data to all ranks -// - Connectivity pairs allow flexible reconstruction of element-node relationships -// - Using sets and maps for efficient lookups (O(log n) instead of O(n)) -// - Distributed computation avoids a single bottleneck rank -// - double t_ghost_start = MPI_Wtime(); - // ======================================================================== - // STEP 1: Gather element ownership information from all ranks - // ======================================================================== - // In a distributed mesh, each rank owns a subset of elements. 
To identify - // ghost elements (elements from other ranks needed by this rank), we need - // to know which rank owns each element. This section uses MPI collective - // operations to gather element GID ownership information. - // - // MPI COLLECTIVE OPERATIONS EXPLAINED: - // ==================================== - // - MPI_Barrier: Synchronizes all ranks; waits until all ranks reach this point - // - MPI_Allgather: Each rank sends one item of data; each rank receives one item from each rank - // Input: Each rank provides local data - // Output: Every rank has data from every rank in order (rank 0's data, rank 1's data, ...) - // - MPI_Allgatherv: Like MPI_Allgather but for variable-sized data - // Input: Each rank provides data of potentially different sizes - // Output: Every rank has all data from all ranks, with displacement arrays specifying where each rank's data goes - // - // COMMUNICATION PATTERN VISUALIZATION: - // Rank 0: elem_count[0] ----> All ranks receive: [elem_count[0], elem_count[1], elem_count[2], ...] - // Rank 1: elem_count[1] / - // Rank 2: elem_count[2] / + // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator + // that efficiently represents the communication pattern between ranks. + // This allows MPI to optimize communication based on the actual connectivity pattern. - // MPI_Allgather: Each rank sends its element count, every rank receives - // the count from every other rank. Result: elem_counts[r] = number of - // elements owned by rank r. 
- std::vector elem_counts(world_size); - MPI_Allgather(&intermediate_mesh.num_elems, 1, MPI_INT, elem_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); // Synchronize all ranks before proceeding - - // Compute displacements: offset into the global array for each rank's data - // Example: if elem_counts = [100, 150, 120], then - // elem_displs = [0, 100, 250] (where each rank's data starts in all_elem_gids) - std::vector elem_displs(world_size); - int total_elems = 0; - for (int r = 0; r < world_size; r++) { - elem_displs[r] = total_elems; - total_elems += elem_counts[r]; - } - - // MPI_Allgatherv: Gather variable-sized data from all ranks into one array - // Each rank contributes its local_to_global_elem_mapping, which maps - // local element indices to global element GIDs. After this call, - // all_elem_gids contains ALL element GIDs from all ranks, organized by rank. - std::vector all_elem_gids(total_elems); - MPI_Allgatherv(intermediate_mesh.local_to_global_elem_mapping.host_pointer(), intermediate_mesh.num_elems, MPI_UNSIGNED_LONG_LONG, - all_elem_gids.data(), elem_counts.data(), elem_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - - // Build a lookup map: element GID -> owning rank - // This allows O(log n) lookups to determine which rank owns any given element. - std::map elem_gid_to_rank; - for (int r = 0; r < world_size; r++) { - for (int i = 0; i < elem_counts[r]; i++) { - size_t gid = all_elem_gids[elem_displs[r] + i]; - elem_gid_to_rank[gid] = r; - } - } - - // ======================================================================== - // STEP 2: Build element-to-node connectivity for local elements - // ======================================================================== - // Ghost elements are elements from other ranks that share nodes with our - // locally-owned elements. To identify them, we need to exchange element-node - // connectivity information with all other ranks. 
- - // Collect all nodes that belong to our locally-owned elements - // This set will be used later to check if a remote element is relevant - std::set local_elem_nodes; - for(int node_rid = 0; node_rid < intermediate_mesh.num_nodes; node_rid++) { - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_rid); - local_elem_nodes.insert(node_gid); - } - // ======================================================================== - // STEP 3: Exchange element-to-node connectivity via MPI_Allgatherv - // ======================================================================== - // Build a flattened connectivity array: pairs of (elem_gid, node_gid) - // Example for 2 elements with 8 nodes each: - // elem_node_conn = [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] - // - // This format is chosen because it's easy to serialize and deserialize over MPI, - // and allows us to reconstruct the full element-node relationships. - std::vector elem_node_conn; - int local_conn_size = 0; + // ---------- Prepare INCOMING edges (sources) ---------- + // indegree: Number of ranks from which this rank will RECEIVE data + // These are the ranks that own elements which are ghosted on this rank + std::vector ghost_elem_receive_ranks_vec(ghost_elem_receive_ranks.begin(), + ghost_elem_receive_ranks.end()); + // The number of ranks from which this rank will receive data (incoming neighbors) + int indegree = static_cast(ghost_elem_receive_ranks_vec.size()); - // For each locally-owned element, record its GID and all its node GIDs - for (int lid = 0; lid < intermediate_mesh.num_elems; lid++) { - size_t elem_gid = intermediate_mesh.local_to_global_elem_mapping.host(lid); - - // Access nodes_in_elem[lid][*] to get all nodes in this element - for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { - size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); // Local index - size_t node_gid = 
intermediate_mesh.local_to_global_node_mapping.host(node_lid); // Global index - - elem_node_conn.push_back(elem_gid); - elem_node_conn.push_back(node_gid); - } - local_conn_size += nodes_per_elem * 2; // Each element contributes (num_nodes_in_elem * 2) size_ts - } + // sources: Array of source rank IDs (ranks we receive from) + // Each element corresponds to a rank that owns elements we ghost + int* sources = (indegree > 0) ? ghost_elem_receive_ranks_vec.data() : MPI_UNWEIGHTED; - - - // ======================================================================== - // Perform MPI communication to gather connectivity from all ranks - // ======================================================================== - // Similar to Step 1, we use MPI_Allgatherv to collect all element-node - // connectivity pairs. This is a two-stage process: - // 1) Gather the size of each rank's connectivity data - // 2) Gather the actual connectivity data with proper offsets - // Stage 1: Gather connectivity sizes from each rank - // conn_sizes[r] = number of size_t values that rank r will send - std::vector conn_sizes(world_size); - MPI_Allgather(&local_conn_size, 1, MPI_INT, conn_sizes.data(), 1, MPI_INT, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); + // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) + // Could be used to specify communication volume if needed for optimization + int* sourceweights = MPI_UNWEIGHTED; - // Compute displacements for the second MPI_Allgatherv call - // Displcements tell each rank where its data should be placed in the global array - std::vector conn_displs(world_size); - int total_conn = 0; - for (int r = 0; r < world_size; r++) { - conn_displs[r] = total_conn; - total_conn += conn_sizes[r]; - } + // ---------- Prepare OUTGOING edges (destinations) ---------- + // outdegree: Number of ranks to which this rank will SEND data + // These are the ranks that ghost elements owned by this rank + int outdegree = num_ghost_comm_ranks; 
- // Stage 2: Gather all element-node connectivity data - // After this call, all_conn contains the flattened connectivity from every rank, - // organized by rank. Access data from rank r using indices [conn_displs[r], conn_displs[r] + conn_sizes[r]) - std::vector all_conn(total_conn); - MPI_Allgatherv(elem_node_conn.data(), local_conn_size, MPI_UNSIGNED_LONG_LONG, - all_conn.data(), conn_sizes.data(), conn_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + // destinations: Array of destination rank IDs (ranks we send to) + // Each element corresponds to a rank that ghosts our owned elements + int* destinations = (outdegree > 0) ? ghost_comm_ranks_vec.data() : MPI_UNWEIGHTED; + + // Initialize the graph communicator for element communication + element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), indegree, ghost_elem_receive_ranks_vec.data()); MPI_Barrier(MPI_COMM_WORLD); - // ======================================================================== - // STEP 4: Identify ghost elements - // ======================================================================== - // A ghost element is an element owned by another rank that shares at least - // one node with our locally-owned elements. This step identifies all such elements. 
+ // Optional: Verify the graph communicator was created successfully + // if(print_info) element_communication_plan.verify_graph_communicator(); + + // ****************************************************************************************** +// Build send counts and displacements for element communication +// ****************************************************************************************** + + // ========== Build send counts and displacements for OUTGOING neighbors (destinations) ========== + // For MPI_Neighbor_alltoallv with graph communicator: + // - elem_sendcounts[i] = number of elements to send to i-th outgoing neighbor (destinations_out[i]) + // - elem_sdispls[i] = starting position in send buffer for i-th outgoing neighbor - // Build a set of locally-owned element GIDs for quick lookup - std::set local_elem_gids; - for (int i = 0; i < intermediate_mesh.num_elems; i++) { - local_elem_gids.insert(intermediate_mesh.local_to_global_elem_mapping.host(i)); - } + // std::vector elem_sendcounts(element_communication_plan.num_send_ranks, 0); + // std::vector elem_sdispls(element_communication_plan.num_send_ranks, 0); - // Build a temporary map: node GID -> set of element GIDs (from other ranks) that contain it - // This helps us identify which remote elements are adjacent to our local elements - std::map> node_to_ext_elem; + // Count how many boundary elements go to each destination rank + // boundary_elem_targets[elem_lid] contains pairs (dest_rank, elem_gid) for each boundary element + std::map> elems_to_send_by_rank; // rank -> list of boundary element local IDs - // Iterate through connectivity data from each rank (except ourselves) - for (int r = 0; r < world_size; r++) { - if (r == rank) continue; // Skip our own data - we already know our elements - - // Parse the connectivity data for rank r - // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
- // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 - int num_pairs = conn_sizes[r] / 2; - - for (int i = 0; i < num_pairs; i++) { - // Offset into all_conn for this pair (elem_gid, node_gid) - int offset = conn_displs[r] + i * 2; - size_t elem_gid = all_conn[offset]; - size_t node_gid = all_conn[offset + 1]; - - // Check if this node belongs to one of our locally-owned elements - if (local_elem_nodes.find(node_gid) != local_elem_nodes.end()) { - // Check if this element is NOT owned by us (i.e., it's from another rank) - if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { - // This is a ghost element for us - node_to_ext_elem[node_gid].insert(elem_gid); - } + for (int elem_lid = 0; elem_lid < input_mesh.num_elems; elem_lid++) { + if (!boundary_elem_targets[elem_lid].empty()) { + for (const auto &pr : boundary_elem_targets[elem_lid]) { + int dest_rank = pr.first; + elems_to_send_by_rank[dest_rank].push_back(elem_lid); } } } - - // Extract all unique ghost element GIDs - // We use a set to eliminate duplicates (same ghost element might share multiple nodes with us) - std::set ghost_elem_gids; - for (const auto& pair : node_to_ext_elem) { - for (size_t elem_gid : pair.second) { - ghost_elem_gids.insert(elem_gid); + + // Serialize into a DRaggedRightArrayKokkos + CArrayKokkos strides_array(element_communication_plan.num_send_ranks); + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); + strides_array(i) = elems_to_send_by_rank[dest_rank].size(); + } + DRaggedRightArrayKokkos elems_to_send_by_rank_rr(strides_array, "elems_to_send_by_rank"); + + // Fill in the data + for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { + int dest_rank = element_communication_plan.send_rank_ids.host(i); + for (int j = 0; j < elems_to_send_by_rank[dest_rank].size(); j++) { + elems_to_send_by_rank_rr.host(i, j) = elems_to_send_by_rank[dest_rank][j]; } } + 
elems_to_send_by_rank_rr.update_device(); + - // Additional check: elements that are neighbors of our locally-owned elements - // but are owned by other ranks (these might already be in ghost_elem_gids, but check connectivity) - - // for (int lid = 0; lid < num_new_elems; lid++) { - // size_t num_neighbors = intermediate_mesh.num_elems_in_elem(lid); - - // for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { - // size_t neighbor_lid = intermediate_mesh.elems_in_elem(lid, nbr_idx); - - // if (neighbor_lid < static_cast(num_new_elems)) { - // size_t neighbor_gid = intermediate_mesh.local_to_global_elem_mapping(neighbor_lid); - - // // Check if neighbor is owned by this rank - // auto it = elem_gid_to_rank.find(neighbor_gid); - // if (it != elem_gid_to_rank.end() && it->second != rank) { - // // Neighbor is owned by another rank - it's a ghost for us - // std::cout << "[rank " << rank << "] found ghost element " << neighbor_gid << std::endl; - // ghost_elem_gids.insert(neighbor_gid); - // } - // } - // } - // } - - // Store the count of ghost elements for later use - intermediate_mesh.num_ghost_elems = ghost_elem_gids.size(); - - MPI_Barrier(MPI_COMM_WORLD); - - - // ======================================================================== - // STEP 5: Extract ghost element connectivity - // ======================================================================== - // Now that we know which elements are ghosts, we need to extract their - // full node connectivity from all_conn. This allows us to properly construct - // the extended mesh with ghost elements included. 
- - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Starting to build extended mesh with ghost elements" << std::endl; + // Count how many ghost elements come from each source rank + // ghost_elem_owner_ranks[i] tells us which rank owns the i-th ghost element + std::map> elems_to_recv_by_rank; // rank -> list of ghost element indices - // Build a map: ghost_elem_gid -> vector of node_gids - // We pre-allocate the vector size to avoid repeated reallocations - std::map> ghost_elem_to_nodes; - for (const size_t& ghost_gid : ghost_elem_gids) { - ghost_elem_to_nodes[ghost_gid].reserve(intermediate_mesh.num_nodes_in_elem); + for (size_t i = 0; i < ghost_elem_owner_ranks.size(); i++) { + int source_rank = ghost_elem_owner_ranks[i]; + int ghost_elem_local_id = output_mesh.num_owned_elems + i; + elems_to_recv_by_rank[source_rank].push_back(ghost_elem_local_id); } - - // ======================================================================== - // Extract nodes for each ghost element from the globally-collected all_conn - // ======================================================================== - // The all_conn array was populated by MPI_Allgatherv and contains connectivity - // pairs (elem_gid, node_gid) for all elements from all ranks. We now parse - // this data to extract the nodes for each ghost element. 
- for (int r = 0; r < world_size; r++) { - if (r == rank) continue; // Skip our own data - we already have owned element connectivity - - // Parse connectivity data for rank r - int num_pairs = conn_sizes[r] / 2; - - for (int i = 0; i < num_pairs; i++) { - // Calculate offset for this pair: displacement + (pair_index * 2) - int offset = conn_displs[r] + i * 2; - size_t elem_gid = all_conn[offset]; - size_t node_gid = all_conn[offset + 1]; - - // If this element is one of our identified ghost elements, record its node - auto it = ghost_elem_to_nodes.find(elem_gid); - if (it != ghost_elem_to_nodes.end()) { - it->second.push_back(node_gid); - } - } + + // ========== Serialize into a DRaggedRightArrayKokkos ========== + CArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks); + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int source_rank = element_communication_plan.recv_rank_ids.host(i); + elem_recv_strides_array(i) = elems_to_recv_by_rank[source_rank].size(); + } - - // ======================================================================== - // Validation: Verify each ghost element has the correct number of nodes - // ======================================================================== - // This catch detects issues in the MPI communication or parsing logic - for (auto& pair : ghost_elem_to_nodes) { - if (pair.second.size() != static_cast(intermediate_mesh.num_nodes_in_elem)) { - std::cerr << "[rank " << rank << "] ERROR: Ghost element " << pair.first - << " has " << pair.second.size() << " nodes, expected " << intermediate_mesh.num_nodes_in_elem << std::endl; + DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); + // Fill in the data + for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { + int source_rank = element_communication_plan.recv_rank_ids.host(i); + for (int j = 0; j < elems_to_recv_by_rank[source_rank].size(); j++) { + 
elems_to_recv_by_rank_rr.host(i, j) = elems_to_recv_by_rank[source_rank][j]; } } - - // Step 2: Build extended node list (owned nodes first, then ghost-only nodes) - // Start with owned nodes - std::map node_gid_to_extended_lid; - int extended_node_lid = 0; - - // Add all owned nodes - for (int i = 0; i < intermediate_mesh.num_nodes; i++) { - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(i); - node_gid_to_extended_lid[node_gid] = extended_node_lid++; - } - - // Add ghost-only nodes (nodes that belong to ghost elements but not to owned elements) - std::set ghost_only_nodes; - for (const auto& pair : ghost_elem_to_nodes) { - for (size_t node_gid : pair.second) { - // Check if we already have this node - if (node_gid_to_extended_lid.find(node_gid) == node_gid_to_extended_lid.end()) { - ghost_only_nodes.insert(node_gid); - } - } + elems_to_recv_by_rank_rr.update_device(); + element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); + + MPI_Barrier(MPI_COMM_WORLD); + + +} + + +/** + * @brief Partitions the input mesh using PT-Scotch and constructs the final distributed mesh. + * + * This function performs parallel mesh partitioning using a two-stage approach: + * 1. A naive partition is first constructed (simple assignment of mesh elements/nodes across ranks). + * 2. PT-Scotch is then used to repartition the mesh for load balancing and improved connectivity. + * + * The partitioned mesh, nodal data, and associated connectivity/gauss point information + * are distributed among MPI ranks as a result. The procedure ensures that each rank receives + * its assigned portion of the mesh and associated data in the final (target) decomposition. + * + * @param initial_mesh[in] The input (global) mesh, present on rank 0 or all ranks at start. + * @param final_mesh[out] The mesh assigned to this rank after PT-Scotch decomposition. + * @param initial_node[in] Nodal data for the input (global) mesh; must match initial_mesh. 
+ * @param final_node[out] Nodal data for this rank after decomposition (corresponds to final_mesh). + * @param gauss_point[out] Gauss point data structure, filled out for this rank's mesh. + * @param world_size[in] Number of MPI ranks in use (the total number of partitions). + * @param rank[in] This process's MPI rank ID. + * + * Internals: + * - The routine uses a naive_partition_mesh() helper to create an initial contiguous mesh partition. + * - It then uses PT-Scotch distributed graph routines to compute an improved partition and create the final mesh layout. + * - Both element-to-element and node-to-element connectivity, as well as mapping and ghosting information, + * are managed and exchanged across ranks. + * - MPI routines synchronize and exchange the relevant mesh and nodal data following the computed partition. + */ + +void partition_mesh( + Mesh_t& initial_mesh, + Mesh_t& final_mesh, + node_t& initial_node, + node_t& final_node, + GaussPoint_t& gauss_point, + int world_size, + int rank){ + + bool print_info = false; + bool print_vtk = false; + + // Create mesh, gauss points, and node data structures on each rank + // This is the initial partitioned mesh + Mesh_t naive_mesh; + node_t naive_node; + + // Mesh partitioned by pt-scotch, not including ghost + Mesh_t intermediate_mesh; + node_t intermediate_node; + + + // Helper arrays to hold element-element connectivity for naive partitioning that include what would be ghost, without having to build the full mesh + std::vector elems_in_elem_on_rank; + std::vector num_elems_in_elem_per_rank; + + + // Perform the naive partitioning of the mesh + naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); + + + /********************************************************************************** + * Build PT-Scotch distributed graph representation of the mesh for repartitioning * + 
********************************************************************************** + * + * This section constructs the distributed graph (SCOTCH_Dgraph) needed by PT-Scotch + * for mesh repartitioning. In this graph, each mesh element is a vertex, and edges + * correspond to mesh-neighbor relationships (i.e., elements that share a face or are + * otherwise neighbors per your mesh definition). + * + * We use the compact CSR (Compressed Sparse Row) representation, passing only the + * essential information required by PT-Scotch. + * + * Variables and structures used: + * - SCOTCH_Dgraph dgraph: + * The distributed graph instance managed by PT-Scotch. Each MPI rank creates + * and fills in its portion of the global graph. + * + * - const SCOTCH_Num baseval: + * The base value for vertex and edge numbering. Set to 0 for C-style zero-based + * arrays. Always use 0 unless you are using Fortran style 1-based arrays. + * + * - const SCOTCH_Num vertlocnbr: + * The *number of local vertices* (mesh elements) defined on this MPI rank. + * In our mesh, this is mesh.num_elems. PT-Scotch expects each rank to specify + * its own local vertex count. + * + * - const SCOTCH_Num vertlocmax: + * The *maximum number of local vertices* that could be stored (capacity). We + * allocate with no unused holes, so vertlocmax = vertlocnbr. + * + * - std::vector vertloctab: + * CSR array [size vertlocnbr+1]: for each local vertex i, vertloctab[i] + * gives the index in edgeloctab where the neighbor list of vertex i begins. + * PT-Scotch expects this array to be of size vertlocnbr+1, where the difference + * vertloctab[i+1] - vertloctab[i] gives the number of edges for vertex i. + * + * - std::vector edgeloctab: + * CSR array [variable size]: a flattened list of *neighboring element global IDs*, + * in no particular order. For vertex i, its neighbors are located at + * edgeloctab[vertloctab[i]...vertloctab[i+1]-1]. 
+ * In this compact CSR, these are global IDs (GIDs), enabling PT-Scotch to + * recognize edges both within and across ranks. + * + * - std::map elem_gid_to_offset: + * Helper map: For a given element global ID, gives the starting offset in + * the flattened neighbor array (elems_in_elem_on_rank) where this element's + * list of neighbor GIDs begins. This allows efficient neighbor list lookup. + * + * - (other arrays used, from mesh setup and communication phase) + * - elements_on_rank: vector of global element IDs owned by this rank. + * - num_elements_on_rank: number of owned elements. + * - num_elems_in_elem_per_rank: array, for each owned element, how many + * neighbors it has. + * - elems_in_elem_on_rank: flattened array of global neighbor IDs for all local elements. + * + **********************************************************************************/ + + // --- Step 1: Initialize the PT-Scotch distributed graph object on this MPI rank --- + SCOTCH_Dgraph dgraph; + if (SCOTCH_dgraphInit(&dgraph, MPI_COMM_WORLD) != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphInit failed\n"; + MPI_Abort(MPI_COMM_WORLD, 1); } - - // Assign extended local IDs to ghost-only nodes - for (size_t node_gid : ghost_only_nodes) { - node_gid_to_extended_lid[node_gid] = extended_node_lid++; + + // Set base value for numbering (0 for C-style arrays) + const SCOTCH_Num baseval = 0; + + // vertlocnbr: Number of elements (vertices) that are local to this MPI rank + const SCOTCH_Num vertlocnbr = static_cast(naive_mesh.num_elems); + + // vertlocmax: Maximum possible local vertices (no holes, so identical to vertlocnbr) + const SCOTCH_Num vertlocmax = vertlocnbr; + + // --- Step 2: Build compact CSR arrays for PT-Scotch (vertloctab, edgeloctab) --- + // vertloctab: for each local mesh element [vertex], gives index in edgeloctab where its neighbor list begins + std::vector vertloctab(vertlocnbr + 1); + + // edgeloctab: flat array of neighbor global IDs for all local elements, built 
in order + std::vector edgeloctab; + edgeloctab.reserve(vertlocnbr * 6); // heuristic: assume typical mesh degree is ~6, for performance + + // Construct a map from element GID to its offset into elems_in_elem_on_rank (the array of neighbor GIDs) + // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. + std::map elem_gid_to_offset; + size_t current_offset = 0; + for (size_t k = 0; k < naive_mesh.num_elems; k++) { + int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); + elem_gid_to_offset[elem_gid_on_rank] = current_offset; + current_offset += num_elems_in_elem_per_rank[k]; // WARNING< THIS MUST INCLUDE GHOST< WHICH DONT EXISTS ON THE NAIVE MESH } - - int total_extended_nodes = extended_node_lid; - - // Step 3: Prepare requests for ghost node coordinates from owning ranks (if needed later) - // Build request list: for each ghost node, find an owning rank via any ghost element that contains it - std::map> rank_to_ghost_node_requests; - for (size_t node_gid : ghost_only_nodes) { - // Find which rank owns an element containing this node - // Look through ghost elements - for (const auto& pair : ghost_elem_to_nodes) { - size_t ghost_elem_gid = pair.first; - const std::vector& nodes = pair.second; - bool found = false; - for (size_t ngid : nodes) { - if (ngid == node_gid) { - found = true; - break; - } - } - if (found) { - auto owner_it = elem_gid_to_rank.find(ghost_elem_gid); - if (owner_it != elem_gid_to_rank.end()) { - rank_to_ghost_node_requests[owner_it->second].push_back(node_gid); - break; - } + + // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- + SCOTCH_Num offset = 0; // running count of edges encountered + + for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) { + + // Record current edge offset for vertex lid in vertloctab + vertloctab[lid] = offset; + + // Obtain this local element's global ID (from mapping) + int elem_gid = 
naive_mesh.local_to_global_elem_mapping.host(lid); + + // Find offset in the flattened neighbor array for this element's neighbor list + size_t elems_in_elem_offset = elem_gid_to_offset[elem_gid]; + + // For this element, find the count of its neighbors + // This requires finding its index in the elements_on_rank array + size_t idx = 0; + for (size_t k = 0; k < naive_mesh.num_elems; k++) { + int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); + if (elem_gid_on_rank == elem_gid) { + idx = k; + break; } } - } - - // Step 4: Build extended element list and node connectivity - // Owned elements: 0 to num_new_elems-1 (already have these) - // Ghost elements: num_new_elems to num_new_elems + num_ghost_elems - 1 - - // Create extended element-node connectivity array - int total_extended_elems = intermediate_mesh.num_elems + intermediate_mesh.num_ghost_elems; - std::vector> extended_nodes_in_elem(total_extended_elems); - - // Copy owned element connectivity (convert to extended node LIDs) - for (int lid = 0; lid < intermediate_mesh.num_elems; lid++) { - extended_nodes_in_elem[lid].reserve(nodes_per_elem); - for (int j = 0; j < nodes_per_elem; j++) { - size_t node_lid = intermediate_mesh.nodes_in_elem.host(lid, j); - size_t node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - int ext_lid = node_gid_to_extended_lid[node_gid]; - extended_nodes_in_elem[lid].push_back(ext_lid); + size_t num_nbrs = num_elems_in_elem_per_rank[idx]; + + // Append each neighbor (by its GLOBAL elem GID) to edgeloctab + for (size_t j = 0; j < num_nbrs; j++) { + size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; // This is a global element ID! 
+ edgeloctab.push_back(static_cast(neighbor_gid)); + ++offset; // Increment running edge count } } - - // Add ghost element connectivity (map ghost node GIDs to extended node LIDs) - int ghost_elem_ext_lid = intermediate_mesh.num_elems; - std::vector ghost_elem_gids_ordered(ghost_elem_gids.begin(), ghost_elem_gids.end()); - std::sort(ghost_elem_gids_ordered.begin(), ghost_elem_gids_ordered.end()); - - for (size_t ghost_gid : ghost_elem_gids_ordered) { - auto it = ghost_elem_to_nodes.find(ghost_gid); - if (it == ghost_elem_to_nodes.end()) continue; - - extended_nodes_in_elem[ghost_elem_ext_lid].reserve(nodes_per_elem); - for (size_t node_gid : it->second) { - int ext_lid = node_gid_to_extended_lid[node_gid]; - extended_nodes_in_elem[ghost_elem_ext_lid].push_back(ext_lid); + + // vertloctab[vertlocnbr] stores total number of edges written, finalizes the CSR structure + vertloctab[vertlocnbr] = offset; + + // edgelocnbr/edgelocsiz: Number of edge endpoints defined locally + // (PT-Scotch's distributed graphs allow edges to be replicated or owned by either endpoint) + const SCOTCH_Num edgelocnbr = offset; // total number of edge endpoints (sum of all local neighbor degrees) + const SCOTCH_Num edgelocsiz = edgelocnbr; // allocated size matches number of endpoints + + // Optionally print graph structure for debugging/validation + if (print_info) { + std::cout << "Rank " << rank << ": vertlocnbr = # of local elements(vertices) = " << vertlocnbr + << ", edgelocnbr = # of local edge endpoints = " << edgelocnbr << std::endl; + std::cout << "vertloctab (CSR row offsets): "; + for (size_t i = 0; i <= vertlocnbr; i++) { + std::cout << vertloctab[i] << " "; } - ghost_elem_ext_lid++; - } - - MPI_Barrier(MPI_COMM_WORLD); - // Sequential rank-wise printing of extended mesh structure info - if(print_info) { - for (int r = 0; r < world_size; r++) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "[rank " << rank << "] Finished building extended mesh structure" << 
std::endl; - std::cout << "[rank " << rank << "] - Owned elements: " << intermediate_mesh.num_elems << std::endl; - std::cout << "[rank " << rank << "] - Ghost elements: " << ghost_elem_gids.size() << std::endl; - std::cout << "[rank " << rank << "] - Total extended elements: " << total_extended_elems << std::endl; - std::cout << "[rank " << rank << "] - Owned nodes: " << intermediate_mesh.num_nodes << std::endl; - std::cout << "[rank " << rank << "] - Ghost-only nodes: " << ghost_only_nodes.size() << std::endl; - std::cout << "[rank " << rank << "] - Total extended nodes: " << total_extended_nodes << std::endl; - std::cout << std::flush; - } - MPI_Barrier(MPI_COMM_WORLD); + std::cout << std::endl; + std::cout << "edgeloctab (first 20 neighbor GIDs): "; + for (size_t i = 0; i < std::min((size_t)20, edgeloctab.size()); i++) { + std::cout << edgeloctab[i] << " "; } + std::cout << std::endl; } - // The extended_nodes_in_elem vector now contains the connectivity for both owned and ghost elements - // Each element's nodes are stored using extended local node IDs (0-based, contiguous) - - // Build reverse maps: extended_lid -> gid for nodes and elements - std::vector extended_lid_to_node_gid(total_extended_nodes); - for (const auto& pair : node_gid_to_extended_lid) { - extended_lid_to_node_gid[pair.second] = pair.first; - } - - // Build extended element GID list: owned first, then ghost - std::vector extended_lid_to_elem_gid(total_extended_elems); - // Owned elements - for (int i = 0; i < intermediate_mesh.num_elems; i++) { - extended_lid_to_elem_gid[i] = intermediate_mesh.local_to_global_elem_mapping.host(i); + MPI_Barrier(MPI_COMM_WORLD); + + /************************************************************************** + * Step 4: Build the distributed graph using PT-Scotch's SCOTCH_dgraphBuild + * + * - PT-Scotch will use our CSR arrays. 
Since we use compact representation, + * most optional arrays ("veloloctab", "vlblloctab", "edgegsttab", "edloloctab") + * can be passed as nullptr. + * - edgeloctab contains *GLOBAL element GIDs* of neighbors. PT-Scotch uses this + * to discover connections across processor boundaries, so you do not have to + * encode ownership or partition information yourself. + **************************************************************************/ + int rc = SCOTCH_dgraphBuild( + &dgraph, + baseval, // start index (0) + vertlocnbr, // local vertex count (local elements) + vertlocmax, // local vertex max (no holes) + vertloctab.data(), // row offsets in edgeloctab + /*vendloctab*/ nullptr, // end of row offsets (compact CSR => nullptr) + /*veloloctab*/ nullptr, // vertex weights, not used + /*vlblloctab*/ nullptr, // vertex global labels (we use GIDs in edgeloctab) + edgelocnbr, // local edge endpoints count + edgelocsiz, // size of edge array + edgeloctab.data(), // global neighbor IDs for each local node + /*edgegsttab*/ nullptr, // ghost edge array, not used + /*edloloctab*/ nullptr // edge weights, not used + ); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphBuild failed rc=" << rc << "\n"; + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); } - // Ghost elements (in sorted order) - for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { - extended_lid_to_elem_gid[intermediate_mesh.num_elems + i] = ghost_elem_gids_ordered[i]; + + // Optionally, print rank summary after graph build for further validation + if (print_info) { + SCOTCH_Num vertlocnbr_out; + SCOTCH_dgraphSize(&dgraph, &vertlocnbr_out, nullptr, nullptr, nullptr); + std::cout << "Rank " << rank << ": After dgraphBuild, vertlocnbr = " << vertlocnbr_out << std::endl; } + MPI_Barrier(MPI_COMM_WORLD); - // Build array: for each ghost element, store which rank owns it (where to receive data from) - std::vector ghost_elem_owner_ranks(ghost_elem_gids_ordered.size()); - for 
(size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { - size_t ghost_gid = ghost_elem_gids_ordered[i]; - auto it = elem_gid_to_rank.find(ghost_gid); - if (it != elem_gid_to_rank.end()) { - ghost_elem_owner_ranks[i] = it->second; - } else { - std::cerr << "[rank " << rank << "] ERROR: Ghost element GID " << ghost_gid - << " not found in elem_gid_to_rank map!" << std::endl; - ghost_elem_owner_ranks[i] = -1; // Invalid rank as error indicator - } + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished building the distributed graph using PT-Scotch"< ghost_elem_receive_ranks; - for (size_t i = 0; i < ghost_elem_gids_ordered.size(); i++) { - ghost_elem_receive_ranks.insert(ghost_elem_owner_ranks[i]); - } + /************************************************************** + * Step 6: Partition (repartition) the mesh using PT-Scotch + * - Each vertex (mesh element) will be assigned a part (mesh chunk). + * - Arch is initialized for a complete graph of world_size parts (one per rank). + **************************************************************/ + // SCOTCH_Arch controls the "architecture" for partitioning: the topology + // (number and connectivity of parts) to which the graph will be mapped. + // The archdat variable encodes this. Below are common options: + // + // - SCOTCH_archCmplt(&archdat, nbparts) + // * Creates a "complete graph" architecture with nbparts nodes (fully connected). + // Every part is equally distant from every other part. + // This is typically used when minimizing only *balance* and *edge cut*, + // not considering any underlying machine topology. + // + // - SCOTCH_archHcub(&archdat, dimension) + // * Hypercube architecture (rare in modern use). + // Sets up a hypercube of given dimension. + // + // - SCOTCH_archTleaf / SCOTCH_archTleafX + // * Tree architectures, for hierarchically structured architectures. 
+ // + // - SCOTCH_archMesh2 / SCOTCH_archMesh3 + // * 2D or 3D mesh topology architectures (useful for grid/matrix machines). + // + // - SCOTCH_archBuild + // * General: builds any architecture from a descriptor string. + // + // For distributed mesh partitioning to MPI ranks (where all ranks are equal), + // the most common and appropriate is "complete graph" (Cmplt): each part (rank) + // is equally reachable from any other (no communication topology bias). + SCOTCH_Arch archdat; // PT-Scotch architecture structure: describes desired partition topology + SCOTCH_archInit(&archdat); + // Partition into 'world_size' equally connected parts (each MPI rank is a "node") + // Other topology options could be substituted above according to your needs (see docs). + SCOTCH_archCmplt(&archdat, static_cast(world_size)); + -// ****************************************************************************************** -// Build the final partitioned mesh -// ****************************************************************************************** + + // ===================== PT-Scotch Strategy Selection and Documentation ====================== + // The PT-Scotch "strategy" (stratdat here) controls the algorithms and heuristics used for partitioning. + // You can specify a string or build a strategy using functions that adjust speed, quality, and recursion. + // + // Common strategy flags (see "scotch.h", "ptscotch.h", and PT-Scotch documentation): + // + // - SCOTCH_STRATDEFAULT: Use the default (fast, reasonable quality) partitioning strategy. + // Useful for quick, generic partitions where quality is not critical. + // + // - SCOTCH_STRATSPEED: Aggressively maximizes speed (at the cost of cut quality). + // For large runs or test runs where speed is more important than minimizing edgecut. + // + // - SCOTCH_STRATQUALITY: Prioritizes partition *quality* (minimizing edge cuts, maximizing load balance). + // Slower than the default. 
Use when high-quality partitioning is desired. + // + // - SCOTCH_STRATBALANCE: Tradeoff between speed and quality for balanced workload across partitions. + // Use if load balance is more critical than cut size. + // + // Additional Options: + // - Strategy can also be specified as a string (see Scotch manual, e.g., "b{sep=m{...} ...}"). + // - Recursion count parameter (here, set to 0) controls strategy recursion depth (0 = automatic). + // - Imbalance ratio (here, 0.01) allows minor imbalance in part weight for better cut quality. + // + // Example usage: + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATQUALITY, nparts, 0, 0.01); + // ^ quality-focused, nparts=number of parts/ranks + // SCOTCH_stratDgraphMapBuild(&strat, SCOTCH_STRATSPEED, nparts, 0, 0.05); + // ^ speed-focused, allow 5% imbalance + // + // Reference: + // - https://gitlab.inria.fr/scotch/scotch/-/blob/master/doc/libptscotch.pdf + // - SCOTCH_stratDgraphMapBuild() and related "strategy" documentation. + // + // --------------- Set up the desired partitioning strategy here: --------------- + SCOTCH_Strat stratdat; // PT-Scotch strategy object: holds partitioning options/settings + SCOTCH_stratInit(&stratdat); + + // Select partitioning strategy for this run: + // Use SCOTCH_STRATQUALITY for best cut quality. + // To change: replace with SCOTCH_STRATDEFAULT, SCOTCH_STRATSPEED, or SCOTCH_STRATBALANCE as discussed above. 
+ // Arguments: (strategy object, strategy flag, #parts, recursion (0=auto), imbalance ratio) + SCOTCH_stratDgraphMapBuild(&stratdat, SCOTCH_STRATQUALITY, world_size, 0, 0.001); - final_mesh.initialize_nodes(total_extended_nodes); - final_mesh.initialize_elems(total_extended_elems, 3); - final_mesh.local_to_global_node_mapping = DCArrayKokkos(total_extended_nodes); - final_mesh.local_to_global_elem_mapping = DCArrayKokkos(total_extended_elems); - for (int i = 0; i < total_extended_nodes; i++) { - final_mesh.local_to_global_node_mapping.host(i) = extended_lid_to_node_gid[i]; - } - for (int i = 0; i < total_extended_elems; i++) { - final_mesh.local_to_global_elem_mapping.host(i) = extended_lid_to_elem_gid[i]; + // partloctab: output array mapping each local element (vertex) to a *target partition number* + // After partitioning, partloctab[i] gives the part-assignment (in [0,world_size-1]) for local element i. + std::vector partloctab(vertlocnbr); + rc = SCOTCH_dgraphMap(&dgraph, &archdat, &stratdat, partloctab.data()); + if (rc != 0) { + std::cerr << "[rank " << rank << "] SCOTCH_dgraphMap failed rc=" << rc << "\n"; + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + SCOTCH_dgraphFree(&dgraph); + MPI_Abort(MPI_COMM_WORLD, rc); } - final_mesh.local_to_global_node_mapping.update_device(); - final_mesh.local_to_global_elem_mapping.update_device(); - final_mesh.num_ghost_elems = ghost_elem_gids.size(); - final_mesh.num_ghost_nodes = ghost_only_nodes.size(); + // Clean up PT-Scotch strategy and architecture objects + SCOTCH_stratExit(&stratdat); + SCOTCH_archExit(&archdat); + // Free the graph now that we have the partition assignments + SCOTCH_dgraphFree(&dgraph); - final_mesh.num_owned_elems = intermediate_mesh.num_elems; - final_mesh.num_owned_nodes = intermediate_mesh.num_nodes; - - MPI_Barrier(MPI_COMM_WORLD); - // rebuild the local element-node connectivity using the local node ids - // extended_nodes_in_elem already contains extended local node IDs, so 
we can use them directly - for(int i = 0; i < total_extended_elems; i++) { - for(int j = 0; j < nodes_per_elem; j++) { - final_mesh.nodes_in_elem.host(i, j) = extended_nodes_in_elem[i][j]; + /*************************************************************************** + * Step 7 (Optional): Print out the partitioning assignment per element + * - Each local element's local index lid and global ID (gid) are listed with the + * part to which PT-Scotch has assigned them. + ***************************************************************************/ + print_info = false; + for(int rank_id = 0; rank_id < world_size; rank_id++) { + if(rank_id == rank && print_info) { + for (size_t lid = 0; lid < naive_mesh.num_elems; lid++) { + size_t gid = naive_mesh.local_to_global_elem_mapping.host(lid); + std::cout << "[rank " << rank_id << "] elem_local=" << lid << " gid=" << gid + << " -> part=" << partloctab[lid] << "\n"; + } + MPI_Barrier(MPI_COMM_WORLD); } + MPI_Barrier(MPI_COMM_WORLD); } + print_info = false; - MPI_Barrier(MPI_COMM_WORLD); +// ****************************************************************************************** +// Build the intermediate mesh (without ghost nodes and elements) from the repartition +// ****************************************************************************************** - double t_ghost_end = MPI_Wtime(); - - if (rank == 0) { - std::cout << " Finished calculating ghost elements" << std::endl; - std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." 
<< std::endl; - } - final_mesh.nodes_in_elem.update_device(); - final_mesh.build_connectivity(); MPI_Barrier(MPI_COMM_WORLD); - - if(rank == 0) std::cout << " Finished building final mesh structure with ghost nodes and elements" << std::endl; + if (rank == 0) std::cout << "\n=== Starting Mesh Redistribution Phase ===\n"; MPI_Barrier(MPI_COMM_WORLD); -// ****************************************************************************************** -// Build the final nodes that include ghost -// ****************************************************************************************** - - - final_node.initialize(total_extended_nodes, 3, {node_state::coords}); - - // The goal here is to populate final_node.coords using globally gathered ghost node coordinates, - // since intermediate_node does not contain ghost node coordinates. - // - // Each rank will: - // 1. Gather coordinates of its owned nodes (from intermediate_node). - // 2. Use MPI to gather all coordinates for all required (owned + ghost) global node IDs - // into a structure mapping global ID -> coordinate. - // 3. Use this map to fill final_node.coords. - - // 1. Build list of all global node IDs needed on this rank (owned + ghosts) - std::vector all_needed_node_gids(total_extended_nodes); - for (int i = 0; i < total_extended_nodes; i++) { - all_needed_node_gids[i] = final_mesh.local_to_global_node_mapping.host(i); + // -------------- Phase 1: Determine elements to send to each rank -------------- + std::vector> elems_to_send(world_size); + for (int lid = 0; lid < naive_mesh.num_elems; lid++) { + int dest = static_cast(partloctab[lid]); + int elem_gid = static_cast(naive_mesh.local_to_global_elem_mapping.host(lid)); + elems_to_send[dest].push_back(elem_gid); } - // 2. Build owned node GIDs and their coordinates - std::vector owned_gids(final_mesh.num_owned_nodes); - for (int i = 0; i < final_mesh.num_owned_nodes; i++) - owned_gids[i] = final_mesh.local_to_global_node_mapping.host(i); - - // 3. 
Gather all GIDs in the world that are needed anywhere (owned or ghosted, by any rank) - // so we can distribute the needed coordinate data. - // The easiest is to Allgather everyone's "owned_gids" and coords - - int local_owned_count = static_cast(owned_gids.size()); - std::vector owned_counts(world_size, 0); - if (local_owned_count < 0) local_owned_count = 0; // Clean up possibility of -1 + // -------------- Phase 2: Exchange element GIDs -------------- + std::vector sendcounts(world_size), recvcounts(world_size); + for (int r = 0; r < world_size; r++) + sendcounts[r] = static_cast(elems_to_send[r].size()); - // a) Gather counts - owned_counts.resize(world_size, 0); - MPI_Allgather(&local_owned_count, 1, MPI_INT, owned_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Alltoall(sendcounts.data(), 1, MPI_INT, recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); - // b) Displacements and total - std::vector owned_displs(world_size,0); - int total_owned = 0; + MPI_Barrier(MPI_COMM_WORLD); + + // Compute displacements + std::vector sdispls(world_size), rdispls(world_size); + int send_total = 0, recv_total = 0; for (int r = 0; r < world_size; r++) { - owned_displs[r] = total_owned; - total_owned += owned_counts[r]; + sdispls[r] = send_total; + rdispls[r] = recv_total; + send_total += sendcounts[r]; + recv_total += recvcounts[r]; } - // c) Global GIDs (size: total_owned) - std::vector all_owned_gids(total_owned); - MPI_Allgatherv(owned_gids.data(), local_owned_count, MPI_UNSIGNED_LONG_LONG, - all_owned_gids.data(), owned_counts.data(), owned_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - - // d) Global coords (size: total_owned x 3) - std::vector owned_coords_send(3*local_owned_count, 0.0); - for (int i = 0; i < local_owned_count; i++) { - owned_coords_send[3*i+0] = intermediate_node.coords.host(i,0); - owned_coords_send[3*i+1] = intermediate_node.coords.host(i,1); - owned_coords_send[3*i+2] = intermediate_node.coords.host(i,2); - } - std::vector 
all_owned_coords(3 * total_owned, 0.0); + // Flatten send buffer + // send_elems: flattened list of element global IDs (GIDs) that this rank is sending to all other ranks. + // For each rank r, elems_to_send[r] contains the element GIDs that should be owned by rank r after repartitioning. + std::vector send_elems; + send_elems.reserve(send_total); + for (int r = 0; r < world_size; r++) + send_elems.insert(send_elems.end(), elems_to_send[r].begin(), elems_to_send[r].end()); - // Create coordinate-specific counts and displacements (in units of doubles, not nodes) - std::vector coord_counts(world_size); - std::vector coord_displs(world_size); - for (int r = 0; r < world_size; r++) { - coord_counts[r] = 3 * owned_counts[r]; // Each node has 3 doubles - coord_displs[r] = 3 * owned_displs[r]; // Displacement in doubles + // new_elem_gids: receives the list of new element global IDs this rank will own after the exchange. + // It is filled after MPI_Alltoallv completes, and contains the GIDs for the elements new to (or remained on) this rank. 
+ std::vector new_elem_gids(recv_total); + MPI_Alltoallv(send_elems.data(), sendcounts.data(), sdispls.data(), MPI_INT, + new_elem_gids.data(), recvcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + + // New elements owned by this rank + int num_new_elems = static_cast(new_elem_gids.size()); + + if (print_info) { + std::cout << "[rank " << rank << "] new elems: " << num_new_elems << std::endl; } - MPI_Allgatherv(owned_coords_send.data(), 3*local_owned_count, MPI_DOUBLE, - all_owned_coords.data(), coord_counts.data(), coord_displs.data(), - MPI_DOUBLE, MPI_COMM_WORLD); + // -------------- Phase 3: Send element–node connectivity -------------- + int nodes_per_elem = naive_mesh.num_nodes_in_elem; - // e) Build map: gid -> coord[3] - std::unordered_map> gid_to_coord; - for (int i = 0; i < total_owned; i++) { - std::array xyz = { - all_owned_coords[3*i+0], - all_owned_coords[3*i+1], - all_owned_coords[3*i+2] - }; - gid_to_coord[all_owned_gids[i]] = xyz; - } + // Flatten element-node connectivity by global node IDs + std::vector conn_sendbuf; + for (int r = 0; r < world_size; r++) { + for (int elem_gid : elems_to_send[r]) { + // find local element lid from elem_gid + int lid = -1; + for (int i = 0; i < naive_mesh.num_elems; i++) + if (naive_mesh.local_to_global_elem_mapping.host(i) == elem_gid) { lid = i; break; } - // 4. Finally, fill final_node.coords with correct coordinates. 
- for (int i = 0; i < total_extended_nodes; i++) { - size_t gid = final_mesh.local_to_global_node_mapping.host(i); - auto it = gid_to_coord.find(gid); - if (it != gid_to_coord.end()) { - final_node.coords.host(i,0) = it->second[0]; - final_node.coords.host(i,1) = it->second[1]; - final_node.coords.host(i,2) = it->second[2]; - } else { - // Could happen if there's a bug: fill with zeros for safety - final_node.coords.host(i,0) = 0.0; - final_node.coords.host(i,1) = 0.0; - final_node.coords.host(i,2) = 0.0; + for (int j = 0; j < nodes_per_elem; j++) { + int node_lid = naive_mesh.nodes_in_elem.host(lid, j); + int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); + conn_sendbuf.push_back(node_gid); + } } } - final_node.coords.update_device(); - - - // -------------------------------------------------------------------------------------- - // Build the send patterns for elements - // Build reverse map via global IDs: for each local element gid, find ranks that ghost it. - // Steps: - // 1) Each rank contributes its ghost element GIDs. - // 2) Allgatherv ghost GIDs to build gid -> [ranks that ghost it]. - // 3) For each locally-owned element gid, lookup ranks that ghost it and record targets. 
- // -------------------------------------------------------------------------------------- - std::vector>> boundary_elem_targets(final_mesh.num_owned_elems); - // Prepare local ghost list as vector - std::vector ghost_gids_vec; - ghost_gids_vec.reserve(final_mesh.num_ghost_elems); - for (int i = 0; i < final_mesh.num_ghost_elems; i++) { - ghost_gids_vec.push_back(final_mesh.local_to_global_elem_mapping.host(final_mesh.num_owned_elems + i)); // Ghost elements are after the owned elements in the global element mapping - } + // element-node connectivity counts (ints per dest rank) + std::vector conn_sendcounts(world_size), conn_recvcounts(world_size); + for (int r = 0; r < world_size; r++) + conn_sendcounts[r] = sendcounts[r] * nodes_per_elem; - // Exchange counts - std::vector ghost_counts(world_size, 0); - int local_ghost_count = final_mesh.num_ghost_elems; - MPI_Allgather(&local_ghost_count, 1, MPI_INT, ghost_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); + MPI_Alltoall(conn_sendcounts.data(), 1, MPI_INT, conn_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); - // Displacements and recv buffer - std::vector ghost_displs(world_size, 0); - int total_ghosts = 0; - for (int r = 0; r < world_size; r++) { - ghost_displs[r] = total_ghosts; - total_ghosts += ghost_counts[r]; - } - std::vector all_ghost_gids(total_ghosts); - // Gather ghost gids - MPI_Allgatherv(ghost_gids_vec.data(), local_ghost_count, MPI_UNSIGNED_LONG_LONG, - all_ghost_gids.data(), ghost_counts.data(), ghost_displs.data(), - MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity counts"< ranks that ghost it - std::unordered_map> gid_to_ghosting_ranks; - gid_to_ghosting_ranks.reserve(static_cast(total_ghosts)); + std::vector conn_sdispls(world_size), conn_rdispls(world_size); + int conn_send_total = 0, conn_recv_total = 0; for (int r = 0; r < world_size; r++) { - int cnt = ghost_counts[r]; - int off = ghost_displs[r]; 
- for (int i = 0; i < cnt; i++) { - size_t g = all_ghost_gids[off + i]; - gid_to_ghosting_ranks[g].push_back(r); - } + conn_sdispls[r] = conn_send_total; + conn_rdispls[r] = conn_recv_total; + conn_send_total += conn_sendcounts[r]; + conn_recv_total += conn_recvcounts[r]; } - // For each local element, list destinations: ranks that ghost our gid - for (int elem_lid = 0; elem_lid < final_mesh.num_owned_elems; elem_lid++) { - size_t local_elem_gid = final_mesh.local_to_global_elem_mapping.host(elem_lid); - auto it = gid_to_ghosting_ranks.find(local_elem_gid); - if (it == gid_to_ghosting_ranks.end()) continue; - const std::vector &dest_ranks = it->second; - for (int rr : dest_ranks) { - if (rr == rank) continue; - boundary_elem_targets[elem_lid].push_back(std::make_pair(rr, local_elem_gid)); - } - } + std::vector conn_recvbuf(conn_recv_total); + MPI_Alltoallv(conn_sendbuf.data(), conn_sendcounts.data(), conn_sdispls.data(), MPI_INT, + conn_recvbuf.data(), conn_recvcounts.data(), conn_rdispls.data(), MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - - // Add a vector to store boundary element local_ids (those who have ghost destinations across ranks) - std::vector boundary_elem_local_ids; - std::vector> boundary_to_ghost_ranks; // ragged array dimensions (num_boundary_elems, num_ghost_ranks) + if(rank == 0) std::cout<<" Finished exchanging element–node connectivity"< ghost_comm_ranks; // set of ranks that this rank communicates with - + // -------------- Phase 4: Build new node list (unique GIDs) -------------- + std::set node_gid_set(conn_recvbuf.begin(), conn_recvbuf.end()); + std::vector new_node_gids(node_gid_set.begin(), node_gid_set.end()); + int num_new_nodes = static_cast(new_node_gids.size()); - for (int elem_lid = 0; elem_lid < final_mesh.num_owned_elems; elem_lid++) { + // Build map gid→lid + std::unordered_map node_gid_to_lid; + for (int i = 0; i < num_new_nodes; i++) + node_gid_to_lid[new_node_gids[i]] = i; - int local_elem_gid = 
final_mesh.local_to_global_elem_mapping.host(elem_lid); - if (boundary_elem_targets[elem_lid].empty()) - { - continue; - } - else - { - // Fill in vector of boundary local_ids - boundary_elem_local_ids.push_back(elem_lid); - std::vector ghost_ranks_for_this_boundary_elem; - for (const auto &pr : boundary_elem_targets[elem_lid]) { - ghost_ranks_for_this_boundary_elem.push_back(pr.first); - ghost_comm_ranks.insert(pr.first); + if (print_info) + std::cout << "[rank " << rank << "] owns " << num_new_nodes << " unique nodes\n"; + + + // -------------- Phase 5: Request node coordinates -------------- + std::vector node_coords_sendbuf; + for (int r = 0; r < world_size; r++) { + for (int gid : elems_to_send[r]) { + int lid = -1; + for (int i = 0; i < naive_mesh.num_elems; i++) + if (naive_mesh.local_to_global_elem_mapping.host(i) == gid) { lid = i; break; } + + for (int j = 0; j < nodes_per_elem; j++) { + int node_lid = naive_mesh.nodes_in_elem.host(lid, j); + int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); + + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 0)); + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 1)); + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 2)); } - boundary_to_ghost_ranks.push_back(ghost_ranks_for_this_boundary_elem); } } - int num_ghost_comm_ranks = ghost_comm_ranks.size(); - std::vector ghost_comm_ranks_vec(num_ghost_comm_ranks); - int i = 0; - for (const auto &r : ghost_comm_ranks) { - ghost_comm_ranks_vec[i] = r; - i++; - } - + // Each node is 3 doubles; same sendcounts scaling applies + std::vector coord_sendcounts(world_size), coord_recvcounts(world_size); + for (int r = 0; r < world_size; r++) + coord_sendcounts[r] = sendcounts[r] * nodes_per_elem * 3; + MPI_Alltoall(coord_sendcounts.data(), 1, MPI_INT, coord_recvcounts.data(), 1, MPI_INT, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates 
counts"<(final_mesh.num_boundary_elems); - for (int i = 0; i < final_mesh.num_boundary_elems; i++) { - final_mesh.boundary_elem_local_ids.host(i) = boundary_elem_local_ids[i]; + std::vector coord_sdispls(world_size), coord_rdispls(world_size); + int coord_send_total = 0, coord_recv_total = 0; + for (int r = 0; r < world_size; r++) { + coord_sdispls[r] = coord_send_total; + coord_rdispls[r] = coord_recv_total; + coord_send_total += coord_sendcounts[r]; + coord_recv_total += coord_recvcounts[r]; } - final_mesh.boundary_elem_local_ids.update_device(); - print_info = false; + std::vector coord_recvbuf(coord_recv_total); + MPI_Alltoallv(node_coords_sendbuf.data(), coord_sendcounts.data(), coord_sdispls.data(), MPI_DOUBLE, + coord_recvbuf.data(), coord_recvcounts.data(), coord_rdispls.data(), MPI_DOUBLE, MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished exchanging node coordinates"<(num_new_nodes); + intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); -// ****************************************************************************************** -// Create Communication Plan for element communication -// ****************************************************************************************** - + // Fill global mappings + for (int i = 0; i < num_new_nodes; i++) + intermediate_mesh.local_to_global_node_mapping.host(i) = new_node_gids[i]; + for (int i = 0; i < num_new_elems; i++) + intermediate_mesh.local_to_global_elem_mapping.host(i) = new_elem_gids[i]; - CommunicationPlan element_communication_plan; - element_communication_plan.initialize(MPI_COMM_WORLD); - // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator - // that efficiently represents the communication pattern between ranks. - // This allows MPI to optimize communication based on the actual connectivity pattern. 
- - - // ---------- Prepare INCOMING edges (sources) ---------- - // indegree: Number of ranks from which this rank will RECEIVE data - // These are the ranks that own elements which are ghosted on this rank - std::vector ghost_elem_receive_ranks_vec(ghost_elem_receive_ranks.begin(), - ghost_elem_receive_ranks.end()); - // The number of ranks from which this rank will receive data (incoming neighbors) - int indegree = static_cast(ghost_elem_receive_ranks_vec.size()); - - // sources: Array of source rank IDs (ranks we receive from) - // Each element corresponds to a rank that owns elements we ghost - int* sources = (indegree > 0) ? ghost_elem_receive_ranks_vec.data() : MPI_UNWEIGHTED; + intermediate_mesh.local_to_global_node_mapping.update_device(); + intermediate_mesh.local_to_global_elem_mapping.update_device(); - - // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) - // Could be used to specify communication volume if needed for optimization - int* sourceweights = MPI_UNWEIGHTED; - - // ---------- Prepare OUTGOING edges (destinations) ---------- - // outdegree: Number of ranks to which this rank will SEND data - // These are the ranks that ghost elements owned by this rank - int outdegree = num_ghost_comm_ranks; - - // destinations: Array of destination rank IDs (ranks we send to) - // Each element corresponds to a rank that ghosts our owned elements - int* destinations = (outdegree > 0) ? 
ghost_comm_ranks_vec.data() : MPI_UNWEIGHTED; - // Initialize the graph communicator for element communication - element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), indegree, ghost_elem_receive_ranks_vec.data()); MPI_Barrier(MPI_COMM_WORLD); - // Optional: Verify the graph communicator was created successfully - // if(print_info) element_communication_plan.verify_graph_communicator(); + if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< elem_sendcounts(element_communication_plan.num_send_ranks, 0); - // std::vector elem_sdispls(element_communication_plan.num_send_ranks, 0); - - // Count how many boundary elements go to each destination rank - // boundary_elem_targets[elem_lid] contains pairs (dest_rank, elem_gid) for each boundary element - std::map> elems_to_send_by_rank; // rank -> list of boundary element local IDs - - for (int elem_lid = 0; elem_lid < intermediate_mesh.num_elems; elem_lid++) { - if (!boundary_elem_targets[elem_lid].empty()) { - for (const auto &pr : boundary_elem_targets[elem_lid]) { - int dest_rank = pr.first; - elems_to_send_by_rank[dest_rank].push_back(elem_lid); + // Binary search through local_to_global_node_mapping to find the equivalent local index + int left = 0, right = num_new_nodes - 1; + while (left <= right) { + int mid = left + (right - left) / 2; + size_t mid_gid = intermediate_mesh.local_to_global_node_mapping.host(mid); + if (node_gid == mid_gid) { + node_lid = mid; + break; + } else if (node_gid < mid_gid) { + right = mid - 1; + } else { + left = mid + 1; + } } + intermediate_mesh.nodes_in_elem.host(i, j) = node_lid; } } - // Serialize into a DRaggedRightArrayKokkos - CArrayKokkos strides_array(element_communication_plan.num_send_ranks); - for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { - int dest_rank = element_communication_plan.send_rank_ids.host(i); - strides_array(i) = 
elems_to_send_by_rank[dest_rank].size(); - } - DRaggedRightArrayKokkos elems_to_send_by_rank_rr(strides_array, "elems_to_send_by_rank"); + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Finished reverse mapping of the element-node connectivity from the global node ids to the local node ids"<> node_gid_to_coords; + int coord_idx = 0; + for (int e = 0; e < intermediate_mesh.num_elems; ++e) { + for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { + int node_gid = conn_recvbuf[e * intermediate_mesh.num_nodes_in_elem + j]; + if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { + node_gid_to_coords[node_gid] = { + coord_recvbuf[coord_idx*3 + 0], + coord_recvbuf[coord_idx*3 + 1], + coord_recvbuf[coord_idx*3 + 2] + }; + } + coord_idx++; } } - elems_to_send_by_rank_rr.update_device(); - - - // Count how many ghost elements come from each source rank - // ghost_elem_owner_ranks[i] tells us which rank owns the i-th ghost element - std::map> elems_to_recv_by_rank; // rank -> list of ghost element indices - for (size_t i = 0; i < ghost_elem_owner_ranks.size(); i++) { - int source_rank = ghost_elem_owner_ranks[i]; - int ghost_elem_local_id = final_mesh.num_owned_elems + i; - elems_to_recv_by_rank[source_rank].push_back(ghost_elem_local_id); - } - - // ========== Serialize into a DRaggedRightArrayKokkos ========== - CArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks); - for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { - int source_rank = element_communication_plan.recv_rank_ids.host(i); - elem_recv_strides_array(i) = elems_to_recv_by_rank[source_rank].size(); - - } - DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); - // Fill in the data - for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { - int source_rank = element_communication_plan.recv_rank_ids.host(i); - for (int j = 0; j < elems_to_recv_by_rank[source_rank].size(); j++) 
{ - elems_to_recv_by_rank_rr.host(i, j) = elems_to_recv_by_rank[source_rank][j]; + // Now fill coordinates in node order + intermediate_node.initialize(num_new_nodes, 3, {node_state::coords}); + for (int i = 0; i < num_new_nodes; i++) { + int node_gid = new_node_gids[i]; + auto it = node_gid_to_coords.find(node_gid); + if (it != node_gid_to_coords.end()) { + intermediate_node.coords.host(i, 0) = it->second[0]; + intermediate_node.coords.host(i, 1) = it->second[1]; + intermediate_node.coords.host(i, 2) = it->second[2]; } } - elems_to_recv_by_rank_rr.update_device(); - element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); + intermediate_node.coords.update_device(); + // Connectivity rebuild + intermediate_mesh.build_connectivity(); MPI_Barrier(MPI_COMM_WORLD); + + CommunicationPlan element_communication_plan; + element_communication_plan.initialize(MPI_COMM_WORLD); + + + build_ghost(intermediate_mesh, final_mesh, intermediate_node, final_node, element_communication_plan, world_size, rank); + + // ****************************************************************************************** // Test element communication using MPI_Neighbor_alltoallv diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 88727e2e..6551fd06 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -60,7 +60,7 @@ int main(int argc, char** argv) { // Read the mesh from a file - // read_vtk_mesh(initial_mesh, initial_node, 3, "meshes/buste.vtk"); + read_vtk_mesh(initial_mesh, initial_node, 3, "/home/jacobmoore/Desktop/repos/MATAR/meshes/impellerOpt.vtk"); double t_init_mesh_end = MPI_Wtime(); std::cout << "Initial mesh build time: " << (t_init_mesh_end - t_init_mesh_start) << " seconds" << std::endl; From 20261c972c9da67c9aed95dd9d0905fb41a68116 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 17 Nov 2025 16:08:37 -0600 Subject: [PATCH 32/52] ENH: WIP, adding nodal comms. 
--- examples/mesh_decomp/decomp_utils.h | 276 ++++++++++++--------------- examples/mesh_decomp/mesh_decomp.cpp | 2 +- examples/mesh_decomp/mesh_io.h | 14 +- examples/mesh_decomp/state.h | 63 +++++- src/include/communication_plan.h | 185 +++++++++--------- src/include/mpi_types.h | 2 +- 6 files changed, 286 insertions(+), 256 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 5cdf4a6f..62986e6e 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "mesh.h" @@ -552,12 +553,40 @@ void naive_partition_mesh( return; } +/// @brief Builds ghost elements and nodes for distributed mesh decomposition. +/// +/// In distributed memory parallel computing with MPI, each rank owns a subset of the mesh. +/// Ghost elements and nodes are copies of elements/nodes from neighboring ranks that share +/// nodes with the locally-owned elements. This function identifies and extracts these ghost +/// entities to enable inter-rank communication and maintain consistency at domain boundaries. +/// +/// The algorithm operates in 5 primary steps: +/// 1. Gather element ownership information from all ranks using MPI_Allgatherv +/// 2. Collect local element-node connectivity for distribution +/// 3. Broadcast connectivity information to all ranks via MPI collective operations +/// 4. Identify which remote elements touch local elements (by shared nodes) +/// 5. 
Extract the full connectivity data for identified ghost elements and their nodes +/// +/// @param[in] input_mesh The locally-owned mesh on this rank containing local elements/nodes +/// @param[out] output_mesh The enriched mesh with ghost elements and nodes added to local mesh +/// @param[in] input_node Node data associated with the input mesh +/// @param[out] output_node Node data extended with ghost nodes +/// @param[in,out] element_communication_plan MPI communication plan specifying which ranks +/// exchange element data (populated by this function) +/// @param[in] world_size Total number of MPI ranks +/// @param[in] rank Current MPI rank (process ID) +/// +/// @note This is a collective MPI operation - all ranks must call this function together. +/// @note Uses data-oriented programming patterns with device-accessible arrays (MATAR containers) +/// @note Performance: O(n_local_elements * n_nodes_per_element) for local operations, +/// plus O(n_global_elements) for global MPI collective operations void build_ghost( Mesh_t& input_mesh, Mesh_t& output_mesh, node_t& input_node, node_t& output_node, CommunicationPlan& element_communication_plan, + CommunicationPlan& node_communication_plan, int world_size, int rank) { @@ -644,18 +673,18 @@ void build_ghost( } // ======================================================================== - // STEP 2: Build element-to-node connectivity for local elements + // STEP 2: Build index sets for local elements and nodes // ======================================================================== - // Ghost elements are elements from other ranks that share nodes with our - // locally-owned elements. To identify them, we need to exchange element-node - // connectivity information with all other ranks. 
- - // Collect all nodes that belong to our locally-owned elements - // This set will be used later to check if a remote element is relevant - std::set local_elem_nodes; + std::set local_node_gids; for(int node_rid = 0; node_rid < input_mesh.num_nodes; node_rid++) { size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_rid); - local_elem_nodes.insert(node_gid); + local_node_gids.insert(node_gid); + } + + // Build a set of locally-owned element GIDs for quick lookup + std::set local_elem_gids; + for (int i = 0; i < input_mesh.num_elems; i++) { + local_elem_gids.insert(input_mesh.local_to_global_elem_mapping.host(i)); } // ======================================================================== @@ -725,15 +754,12 @@ void build_ghost( // A ghost element is an element owned by another rank that shares at least // one node with our locally-owned elements. This step identifies all such elements. - // Build a set of locally-owned element GIDs for quick lookup - std::set local_elem_gids; - for (int i = 0; i < input_mesh.num_elems; i++) { - local_elem_gids.insert(input_mesh.local_to_global_elem_mapping.host(i)); - } + + // We use a set to eliminate duplicates (same ghost element might share multiple nodes with us) + std::set ghost_elem_gids; + std::set ghost_node_gids; - // Build a temporary map: node GID -> set of element GIDs (from other ranks) that contain it - // This helps us identify which remote elements are adjacent to our local elements - std::map> node_to_ext_elem; + std::map ghost_node_recv_rank; // Iterate through connectivity data from each rank (except ourselves) for (int r = 0; r < world_size; r++) { @@ -751,51 +777,21 @@ void build_ghost( size_t node_gid = all_conn[offset + 1]; // Check if this node belongs to one of our locally-owned elements - if (local_elem_nodes.find(node_gid) != local_elem_nodes.end()) { + if (local_node_gids.find(node_gid) != local_node_gids.end()) { + ghost_node_gids.insert(node_gid); + ghost_node_recv_rank[node_gid] = 
r; // Check if this element is NOT owned by us (i.e., it's from another rank) if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { // This is a ghost element for us - node_to_ext_elem[node_gid].insert(elem_gid); + ghost_elem_gids.insert(elem_gid); } } } } - // Extract all unique ghost element GIDs - // We use a set to eliminate duplicates (same ghost element might share multiple nodes with us) - std::set ghost_elem_gids; - for (const auto& pair : node_to_ext_elem) { - for (size_t elem_gid : pair.second) { - ghost_elem_gids.insert(elem_gid); - } - } - - // Additional check: elements that are neighbors of our locally-owned elements - // but are owned by other ranks (these might already be in ghost_elem_gids, but check connectivity) - - // for (int lid = 0; lid < num_new_elems; lid++) { - // size_t num_neighbors = input_mesh.num_elems_in_elem(lid); - - // for (size_t nbr_idx = 0; nbr_idx < num_neighbors; ++nbr_idx) { - // size_t neighbor_lid = input_mesh.elems_in_elem(lid, nbr_idx); - - // if (neighbor_lid < static_cast(num_new_elems)) { - // size_t neighbor_gid = input_mesh.local_to_global_elem_mapping(neighbor_lid); - - // // Check if neighbor is owned by this rank - // auto it = elem_gid_to_rank.find(neighbor_gid); - // if (it != elem_gid_to_rank.end() && it->second != rank) { - // // Neighbor is owned by another rank - it's a ghost for us - // std::cout << "[rank " << rank << "] found ghost element " << neighbor_gid << std::endl; - // ghost_elem_gids.insert(neighbor_gid); - // } - // } - // } - // } - // Store the count of ghost elements for later use input_mesh.num_ghost_elems = ghost_elem_gids.size(); - + input_mesh.num_ghost_nodes = ghost_node_gids.size(); MPI_Barrier(MPI_COMM_WORLD); @@ -1025,7 +1021,6 @@ void build_ghost( output_mesh.num_ghost_elems = ghost_elem_gids.size(); output_mesh.num_ghost_nodes = ghost_only_nodes.size(); - output_mesh.num_owned_elems = input_mesh.num_elems; output_mesh.num_owned_nodes = input_mesh.num_nodes; @@ -1108,6 
+1103,16 @@ void build_ghost( all_owned_gids.data(), owned_counts.data(), owned_displs.data(), MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); + // Map node gid -> owning rank + std::unordered_map node_gid_to_owner_rank; + int owner_offset = 0; + for (int r = 0; r < world_size; r++) { + for (int i = 0; i < owned_counts[r]; i++) { + node_gid_to_owner_rank[all_owned_gids[owner_offset + i]] = r; + } + owner_offset += owned_counts[r]; + } + // d) Global coords (size: total_owned x 3) std::vector owned_coords_send(3*local_owned_count, 0.0); @@ -1312,7 +1317,7 @@ void build_ghost( // Optional: Verify the graph communicator was created successfully // if(print_info) element_communication_plan.verify_graph_communicator(); - // ****************************************************************************************** +// ****************************************************************************************** // Build send counts and displacements for element communication // ****************************************************************************************** @@ -1385,6 +1390,18 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); + // -------------------------------------------------------------------------------------- + // Build the send pattern for nodes + // -------------------------------------------------------------------------------------- + // Build reverse map via global IDs: for each local node gid, find ranks that ghost it. + // Steps: + // 1) Each rank contributes its ghost node GIDs. + // 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. + // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. 
+ // -------------------------------------------------------------------------------------- + + + } @@ -2023,10 +2040,13 @@ void partition_mesh( CommunicationPlan element_communication_plan; element_communication_plan.initialize(MPI_COMM_WORLD); + + + CommunicationPlan node_communication_plan; + node_communication_plan.initialize(MPI_COMM_WORLD); - - build_ghost(intermediate_mesh, final_mesh, intermediate_node, final_node, element_communication_plan, world_size, rank); - + build_ghost(intermediate_mesh, final_mesh, intermediate_node, final_node, element_communication_plan, node_communication_plan, world_size, rank); + MPI_Barrier(MPI_COMM_WORLD); // ****************************************************************************************** @@ -2082,108 +2102,48 @@ void partition_mesh( - // -------------------------------------------------------------------------------------- - // TODO: Build the send pattern for nodes -------------------------------------------------------------------------------------- - // Build reverse map via global IDs: for each local node gid, find ranks that ghost it. - // Steps: - // 1) Each rank contributes its ghost node GIDs. - // 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. - // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. 
- // -------------------------------------------------------------------------------------- - - // std::vector>> boundary_node_targets(intermediate_mesh.num_nodes); - - // // Prepare local ghost node list as vector - // std::vector ghost_node_gids_vec; - // ghost_node_gids_vec.reserve(ghost_only_nodes.size()); - // for (const auto &g : ghost_only_nodes) ghost_node_gids_vec.push_back(g); - - // // Exchange counts - // std::vector ghost_node_counts(world_size, 0); - // int local_ghost_node_count = static_cast(ghost_node_gids_vec.size()); - // MPI_Allgather(&local_ghost_node_count, 1, MPI_INT, ghost_node_counts.data(), 1, MPI_INT, MPI_COMM_WORLD); - - // // Displacements and recv buffer - // std::vector ghost_node_displs(world_size, 0); - // int total_ghost_nodes = 0; - // for (int r = 0; r < world_size; r++) { - // ghost_node_displs[r] = total_ghost_nodes; - // total_ghost_nodes += ghost_node_counts[r]; - // } - // std::vector all_ghost_node_gids(total_ghost_nodes); - - // // Gather ghost node gids - // MPI_Allgatherv(ghost_node_gids_vec.data(), local_ghost_node_count, MPI_UNSIGNED_LONG_LONG, - // all_ghost_node_gids.data(), ghost_node_counts.data(), ghost_node_displs.data(), - // MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD); - - // MPI_Barrier(MPI_COMM_WORLD); - // if(rank == 0) std::cout << " Finished gathering ghost node GIDs" << std::endl; - - - // MPI_Barrier(MPI_COMM_WORLD); - // if(rank == 0) std::cout << " Starting to build the reverse map for node communication" << std::endl; + // Test node communication using MPI_Neighbor_alltoallv + std::vector node_states = {node_state::coords, node_state::scalar_field, node_state::vector_field}; + final_node.initialize(final_mesh.num_nodes, 3, node_states, node_communication_plan); - // // Build map node_gid -> ranks that ghost it - // std::unordered_map> node_gid_to_ghosting_ranks; - // node_gid_to_ghosting_ranks.reserve(static_cast(total_ghost_nodes)); - // for (int r = 0; r < world_size; r++) { - // int cnt = 
ghost_node_counts[r]; - // int off = ghost_node_displs[r]; - // for (int i = 0; i < cnt; i++) { - // size_t g = all_ghost_node_gids[off + i]; - // node_gid_to_ghosting_ranks[g].push_back(r); - // } - // } - - // // For each local node, list destinations: ranks that ghost our node gid - // for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { - // size_t local_node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - // auto it = node_gid_to_ghosting_ranks.find(local_node_gid); - // if (it == node_gid_to_ghosting_ranks.end()) continue; - // const std::vector &dest_ranks = it->second; - // for (int rr : dest_ranks) { - // if (rr == rank) continue; - // boundary_node_targets[node_lid].push_back(std::make_pair(rr, local_node_gid)); - // } - // } - - // std::cout.flush(); - // MPI_Barrier(MPI_COMM_WORLD); - // print_info = false; - - // // Optional: print a compact summary of node reverse map for verification (limited output) - // for(int i = 0; i < world_size; i++) { - // if (rank == i && print_info) { - // std::cout << std::endl; - // for (int node_lid = 0; node_lid < intermediate_mesh.num_nodes; node_lid++) { - - // size_t local_node_gid = intermediate_mesh.local_to_global_node_mapping.host(node_lid); - // if (boundary_node_targets[node_lid].empty()) - // { - // std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: no ghost nodes" << std::endl; - // } - // else - // { - // std::cout << "[rank " << rank << "] " << "node_lid: "<< node_lid <<" - node_gid: " << local_node_gid << " sends to: "; - // int shown = 0; - // for (const auto &pr : boundary_node_targets[node_lid]) { - // if (shown >= 12) { std::cout << " ..."; break; } - // std::cout << "(r" << pr.first << ":gid " << pr.second << ") "; - // shown++; - // } - // std::cout << std::endl; - // } - // } - // std::cout.flush(); - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - // print_info = false; - - // 
MPI_Barrier(MPI_COMM_WORLD); - // if(rank == 0) std::cout << " Finished building node communication reverse map" << std::endl; + for (int i = 0; i < final_mesh.num_owned_nodes; i++) { + final_node.scalar_field.host(i) = static_cast(rank); + final_node.vector_field.host(i, 0) = static_cast(rank); + final_node.vector_field.host(i, 1) = static_cast(rank); + final_node.vector_field.host(i, 2) = static_cast(rank); + } + for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { + final_node.scalar_field.host(i) = -1.0; + final_node.vector_field.host(i, 0) = -1.0; + final_node.vector_field.host(i, 1) = -1.0; + final_node.vector_field.host(i, 2) = -1.0; + } + + final_node.coords.update_device(); + final_node.scalar_field.update_device(); + final_node.vector_field.update_device(); + + final_node.scalar_field.communicate(); + final_node.vector_field.communicate(); + MPI_Barrier(MPI_COMM_WORLD); + + + // Update scalar field to visualize the communication + + for(int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { + double value = 0.0; + for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { + value += final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)); + } + value /= final_mesh.num_nodes_in_elem; + + for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { + final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)) = value; + } + } + + + } diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 6551fd06..2113e9d6 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -60,7 +60,7 @@ int main(int argc, char** argv) { // Read the mesh from a file - read_vtk_mesh(initial_mesh, initial_node, 3, "/home/jacobmoore/Desktop/repos/MATAR/meshes/impellerOpt.vtk"); + // read_vtk_mesh(initial_mesh, initial_node, 3, "/home/jacobmoore/Desktop/repos/MATAR/meshes/impellerOpt.vtk"); double t_init_mesh_end = MPI_Wtime(); std::cout << "Initial mesh build time: " 
<< (t_init_mesh_end - t_init_mesh_start) << " seconds" << std::endl; diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index e6fc65de..c9a75a0f 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -543,8 +543,8 @@ void write_vtu(Mesh_t& mesh, const int num_cell_vec_vars = 1; const int num_cell_tensor_vars = 0; - const int num_point_scalar_vars = 3; - const int num_point_vec_vars = 1; + const int num_point_scalar_vars = 4; + const int num_point_vec_vars = 2; // Scalar values associated with a cell const char cell_scalar_var_names[num_cell_scalar_vars][30] = { @@ -556,11 +556,11 @@ void write_vtu(Mesh_t& mesh, }; const char point_scalar_var_names[num_point_scalar_vars][15] = { - "rank_id", "elems_in_node", "global_node_id" + "rank_id", "elems_in_node", "global_node_id", "scalar_field" }; const char point_vec_var_names[num_point_vec_vars][15] = { - "pos" + "pos", "vector_field" }; // short hand @@ -592,9 +592,15 @@ void write_vtu(Mesh_t& mesh, vec_fields(node_gid, 0, 1) = node.coords.host(node_gid, 1); vec_fields(node_gid, 0, 2) = node.coords.host(node_gid, 2); + // vector field, var 1 + vec_fields(node_gid, 1, 0) = node.vector_field.host(node_gid, 0); + vec_fields(node_gid, 1, 1) = node.vector_field.host(node_gid, 1); + vec_fields(node_gid, 1, 2) = node.vector_field.host(node_gid, 2); + point_scalar_fields(node_gid, 0) = rank; point_scalar_fields(node_gid, 1) = (double)mesh.num_corners_in_node(node_gid); point_scalar_fields(node_gid, 2) = (double)mesh.local_to_global_node_mapping.host(node_gid); + point_scalar_fields(node_gid, 3) = node.scalar_field.host(node_gid); } // File management diff --git a/examples/mesh_decomp/state.h b/examples/mesh_decomp/state.h index 0da00095..eb3d5a6b 100644 --- a/examples/mesh_decomp/state.h +++ b/examples/mesh_decomp/state.h @@ -43,7 +43,9 @@ using namespace mtr; // Possible node states, used to initialize node_t enum class node_state { - coords + coords, + scalar_field, + 
vector_field }; @@ -58,17 +60,68 @@ struct node_t { // Replace with MPIDCArrayKokkos - DCArrayKokkos coords; ///< Nodal coordinates - DCArrayKokkos coords_n0; ///< Nodal coordinates at tn=0 of time integration + MPICArrayKokkos coords; ///< Nodal coordinates + MPICArrayKokkos coords_n0; ///< Nodal coordinates at tn=0 of time integration + MPICArrayKokkos scalar_field; ///< Scalar field on a node + MPICArrayKokkos vector_field; ///< Vector field on a node + + // initialization method (num_nodes, num_dims, state to allocate) void initialize(size_t num_nodes, size_t num_dims, std::vector node_states) + { + + CommunicationPlan comm_plan; + + for (auto field : node_states){ + switch(field){ + case node_state::coords: + if (coords.size() == 0){ + this->coords = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates"); + this->coords.initialize_comm_plan(comm_plan); + } + if (coords_n0.size() == 0){ + this->coords_n0 = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates_n0"); + this->coords_n0.initialize_comm_plan(comm_plan); + } + break; + case node_state::scalar_field: + if (scalar_field.size() == 0) this->scalar_field = MPICArrayKokkos(num_nodes, "node_scalar_field"); + this->scalar_field.initialize_comm_plan(comm_plan); + break; + case node_state::vector_field: + if (vector_field.size() == 0) this->vector_field = MPICArrayKokkos(num_nodes, num_dims, "node_vector_field"); + this->vector_field.initialize_comm_plan(comm_plan); + break; + default: + std::cout<<"Desired node state not understood in node_t initialize"< node_states, CommunicationPlan& comm_plan) { for (auto field : node_states){ switch(field){ case node_state::coords: - if (coords.size() == 0) this->coords = DCArrayKokkos(num_nodes, num_dims, "node_coordinates"); - if (coords_n0.size() == 0) this->coords_n0 = DCArrayKokkos(num_nodes, num_dims, "node_coordinates_n0"); + if (coords.size() == 0){ + this->coords = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates"); + 
this->coords.initialize_comm_plan(comm_plan); + } + if (coords_n0.size() == 0){ + this->coords_n0 = MPICArrayKokkos(num_nodes, num_dims, "node_coordinates_n0"); + this->coords_n0.initialize_comm_plan(comm_plan); + } + break; + case node_state::scalar_field: + if (scalar_field.size() == 0) this->scalar_field = MPICArrayKokkos(num_nodes, "node_scalar_field"); + this->scalar_field.initialize_comm_plan(comm_plan); + break; + case node_state::vector_field: + if (vector_field.size() == 0) this->vector_field = MPICArrayKokkos(num_nodes, num_dims, "node_vector_field"); + this->vector_field.initialize_comm_plan(comm_plan); break; default: std::cout<<"Desired node state not understood in node_t initialize"< #include "matar.h" +#include + using namespace mtr; /** @@ -19,12 +21,20 @@ using namespace mtr; * elem.density.comm() -> automatically syncs ghost elements * */ +enum class communication_plan_type { + no_communication, + all_to_all_graph +}; + + struct CommunicationPlan { // ======================================================================== // Metadata for MPI neighbor graph communication // ======================================================================== + communication_plan_type comm_type = communication_plan_type::no_communication; + // MPI world communicator MPI_Comm mpi_comm_world; bool has_comm_world = false; @@ -164,6 +174,7 @@ using namespace mtr; */ void initialize_graph_communicator(int num_send_ranks, int* send_rank_ids, int num_recv_ranks, int* recv_rank_ids){ + this->comm_type = communication_plan_type::all_to_all_graph; // Check if the MPI_COMM_WORLD communicator has been initialized. 
if(!has_comm_world){ throw std::runtime_error("MPI communicator for the world has not been initialized"); @@ -205,105 +216,105 @@ using namespace mtr; has_comm_graph = true; } - // void verify_graph_communicator(){ - // if(!has_comm_graph){ - // throw std::runtime_error("MPI graph communicator has not been initialized"); - // } + void verify_graph_communicator(){ + if(!has_comm_graph){ + throw std::runtime_error("MPI graph communicator has not been initialized"); + } - // // ============================================================================ - // // Verify the distributed graph communicator - // // ============================================================================ - // // Query the graph to verify it matches what we specified - // int indegree_out, outdegree_out, weighted; - // MPI_Dist_graph_neighbors_count(mpi_comm_graph, &indegree_out, &outdegree_out, &weighted); + // ============================================================================ + // Verify the distributed graph communicator + // ============================================================================ + // Query the graph to verify it matches what we specified + int indegree_out, outdegree_out, weighted; + MPI_Dist_graph_neighbors_count(mpi_comm_graph, &indegree_out, &outdegree_out, &weighted); - // // Allocate arrays to receive neighbor information - // std::vector sources_out(indegree_out); - // std::vector sourceweights_out(indegree_out); - // std::vector destinations_out(outdegree_out); - // std::vector destweights_out(outdegree_out); + // Allocate arrays to receive neighbor information + std::vector sources_out(indegree_out); + std::vector sourceweights_out(indegree_out); + std::vector destinations_out(outdegree_out); + std::vector destweights_out(outdegree_out); - // // Retrieve the actual neighbors from the graph communicator - // MPI_Dist_graph_neighbors(mpi_comm_graph, - // indegree_out, sources_out.data(), sourceweights_out.data(), - // outdegree_out, 
destinations_out.data(), destweights_out.data()); + // Retrieve the actual neighbors from the graph communicator + MPI_Dist_graph_neighbors(mpi_comm_graph, + indegree_out, sources_out.data(), sourceweights_out.data(), + outdegree_out, destinations_out.data(), destweights_out.data()); - // int rank = -1; - // MPI_Comm_rank(mpi_comm_world, &rank); + int rank = -1; + MPI_Comm_rank(mpi_comm_world, &rank); - // // Additional verification: Check if the queried values match our input - // bool verification_passed = true; + // Additional verification: Check if the queried values match our input + bool verification_passed = true; - // // Print verification information for each rank sequentially - // for (int r = 0; r < world_size; ++r) { - // MPI_Barrier(mpi_comm_world); - // if (rank == r) { - // std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << std::endl; - // std::cout << " Indegree (receives from " << indegree_out << " ranks): "; - // for (int i = 0; i < indegree_out; ++i) { - // std::cout << sources_out[i] << " "; - // } - // std::cout << std::endl; + // Print verification information for each rank sequentially + for (int r = 0; r < world_size; ++r) { + MPI_Barrier(mpi_comm_world); + if (rank == r) { + std::cout << "\n[rank " << rank << "] Graph Communicator Verification:" << std::endl; + std::cout << " Indegree (receives from " << indegree_out << " ranks): "; + for (int i = 0; i < indegree_out; ++i) { + std::cout << sources_out[i] << " "; + } + std::cout << std::endl; - // std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; - // for (int i = 0; i < outdegree_out; ++i) { - // std::cout << destinations_out[i] << " "; - // } - // std::cout << std::endl; + std::cout << " Outdegree (sends to " << outdegree_out << " ranks): "; + for (int i = 0; i < outdegree_out; ++i) { + std::cout << destinations_out[i] << " "; + } + std::cout << std::endl; - // std::cout << " Weighted: " << (weighted ? 
"yes" : "no") << std::endl; - // } - // MPI_Barrier(mpi_comm_world); - // } + std::cout << " Weighted: " << (weighted ? "yes" : "no") << std::endl; + } + MPI_Barrier(mpi_comm_world); + } - // // Check if the counts match our stored values - // if (indegree_out != num_recv_ranks) { - // std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " - // << "Expected " << num_recv_ranks << ", got " << indegree_out << std::endl; - // verification_passed = false; - // } - // if (outdegree_out != num_send_ranks) { - // std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! " - // << "Expected " << num_send_ranks << ", got " << outdegree_out << std::endl; - // verification_passed = false; - // } + // Check if the counts match our stored values + if (indegree_out != num_recv_ranks) { + std::cerr << "[rank " << rank << "] ERROR: indegree mismatch! " + << "Expected " << num_recv_ranks << ", got " << indegree_out << std::endl; + verification_passed = false; + } + if (outdegree_out != num_send_ranks) { + std::cerr << "[rank " << rank << "] ERROR: outdegree mismatch! " + << "Expected " << num_send_ranks << ", got " << outdegree_out << std::endl; + verification_passed = false; + } - // // Check if source ranks match (build set from our stored recv_rank_ids) - // std::set sources_set_in; - // for (int i = 0; i < num_recv_ranks; ++i) { - // sources_set_in.insert(recv_rank_ids.host(i)); - // } - // std::set sources_set_out(sources_out.begin(), sources_out.end()); - // if (sources_set_in != sources_set_out) { - // std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" 
<< std::endl; - // verification_passed = false; - // } + // Check if source ranks match (build set from our stored recv_rank_ids) + std::set sources_set_in; + for (int i = 0; i < num_recv_ranks; ++i) { + sources_set_in.insert(recv_rank_ids.host(i)); + } + std::set sources_set_out(sources_out.begin(), sources_out.end()); + if (sources_set_in != sources_set_out) { + std::cerr << "[rank " << rank << "] ERROR: source ranks mismatch!" << std::endl; + verification_passed = false; + } - // // Check if destination ranks match (build set from our stored send_rank_ids) - // std::set dests_set_in; - // for (int i = 0; i < num_send_ranks; ++i) { - // dests_set_in.insert(send_rank_ids.host(i)); - // } - // std::set dests_set_out(destinations_out.begin(), destinations_out.end()); - // if (dests_set_in != dests_set_out) { - // std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" << std::endl; - // verification_passed = false; - // } + // Check if destination ranks match (build set from our stored send_rank_ids) + std::set dests_set_in; + for (int i = 0; i < num_send_ranks; ++i) { + dests_set_in.insert(send_rank_ids.host(i)); + } + std::set dests_set_out(destinations_out.begin(), destinations_out.end()); + if (dests_set_in != dests_set_out) { + std::cerr << "[rank " << rank << "] ERROR: destination ranks mismatch!" << std::endl; + verification_passed = false; + } - // // Global verification check - // int local_passed = verification_passed ? 
1 : 0; - // int global_passed = 0; - // MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, mpi_comm_world); - // MPI_Barrier(mpi_comm_world); - // if (rank == 0) { - // if (global_passed) { - // std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; - // } else { - // std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; - // } - // } - // MPI_Barrier(mpi_comm_world); - // } + // Global verification check + int local_passed = verification_passed ? 1 : 0; + int global_passed = 0; + MPI_Allreduce(&local_passed, &global_passed, 1, MPI_INT, MPI_MIN, mpi_comm_world); + MPI_Barrier(mpi_comm_world); + if (rank == 0) { + if (global_passed) { + std::cout << "\n✓ Graph communicator verification PASSED on all ranks\n" << std::endl; + } else { + std::cout << "\n✗ Graph communicator verification FAILED on one or more ranks\n" << std::endl; + } + } + MPI_Barrier(mpi_comm_world); + } void setup_send_recv(DRaggedRightArrayKokkos &rank_send_ids, DRaggedRightArrayKokkos &rank_recv_ids){ diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index ac651551..10e58121 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -301,7 +301,7 @@ class MPICArrayKokkos { this_array_.update_host(); fill_send_buffer(); - + MPI_Neighbor_alltoallv( send_buffer_.host_pointer(), send_counts_.host_pointer(), From 9c1e9b69c76e5ae842f9873cc64d6c419acab073 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 18 Nov 2025 15:56:05 -0600 Subject: [PATCH 33/52] BUG: Debugging nodal comms, WIP --- examples/mesh_decomp/decomp_utils.h | 336 +++++++++++++++++++++++---- examples/mesh_decomp/mesh_decomp.cpp | 8 +- src/include/communication_plan.h | 150 ++++++++++++ 3 files changed, 449 insertions(+), 45 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 62986e6e..ff697abe 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ 
b/examples/mesh_decomp/decomp_utils.h @@ -676,9 +676,11 @@ void build_ghost( // STEP 2: Build index sets for local elements and nodes // ======================================================================== std::set local_node_gids; + std::map global_to_local_node_mapping; // GID -> local index mapping for(int node_rid = 0; node_rid < input_mesh.num_nodes; node_rid++) { size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_rid); local_node_gids.insert(node_gid); + global_to_local_node_mapping[node_gid] = node_rid; } // Build a set of locally-owned element GIDs for quick lookup @@ -778,8 +780,7 @@ void build_ghost( // Check if this node belongs to one of our locally-owned elements if (local_node_gids.find(node_gid) != local_node_gids.end()) { - ghost_node_gids.insert(node_gid); - ghost_node_recv_rank[node_gid] = r; + // Check if this element is NOT owned by us (i.e., it's from another rank) if (local_elem_gids.find(elem_gid) == local_elem_gids.end()) { // This is a ghost element for us @@ -788,6 +789,122 @@ void build_ghost( } } } + MPI_Barrier(MPI_COMM_WORLD); + + + std::map> ghost_nodes_from_ranks; + + std::set shared_nodes; // nodes on MPI rank boundaries + + // Iterate through connectivity data from each rank (except ourselves) + for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // Skip our own data - we already know our elements + + // Parse the connectivity data for rank r + // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
+ // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Offset into all_conn for this pair (elem_gid, node_gid) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // Check if this element belongs to one of our ghost elements + if (ghost_elem_gids.find(elem_gid) != ghost_elem_gids.end()) { + + // Check if this node is NOT owned by us (i.e., it's from another rank) + if (local_node_gids.find(node_gid) == local_node_gids.end()) { + // This is a ghost node for us + ghost_node_gids.insert(node_gid); + ghost_node_recv_rank[node_gid] = r; + ghost_nodes_from_ranks[r].insert(node_gid); + } + } + } + } + + // WARNING: HERE IS THE BUG: + // When we create the send pattern for ghost nodes, we are not filtering out nodes that are on MPI rank boundaries + + // Create a vector of the ranks that this rank will receive data from for ghost nodes + std::set ghost_node_receive_ranks; + for (const auto& pair : ghost_node_recv_rank) { + ghost_node_receive_ranks.insert(pair.second); + } + + std::vector ghost_node_receive_ranks_vec(ghost_node_receive_ranks.begin(), ghost_node_receive_ranks.end()); + + + // Print out the ghost node receive ranks for each rank sequentially + for (int r = 0; r < world_size; r++) { + if (rank == r) { + MPI_Barrier(MPI_COMM_WORLD); + std::cout << "Rank " << rank << " will receive data from the following ranks for ghost nodes: "; + for (int r : ghost_node_receive_ranks_vec) { + std::cout << r << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + + // Find which nodes *we own* are ghosted on other ranks, and on which ranks + // We want: for each of our local nodes, the list of ranks that ghost it + + // Map: local_node_gid -> set of remote ranks that ghost this node + std::map> local_node_gid_to_ghosting_ranks; + + std::vector> ghosted_nodes_on_ranks(world_size); + + // Iterate 
 through connectivity from all ranks except ourselves + for (int r = 0; r < world_size; r++) { + if (r == rank) continue; // skip our own rank + + int num_pairs = conn_sizes[r] / 2; + for (int i = 0; i < num_pairs; i++) { + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // If this node is owned by us, and remote rank references it, they are ghosting it + if (local_node_gids.find(node_gid) != local_node_gids.end()) { + local_node_gid_to_ghosting_ranks[node_gid].insert(r); + ghosted_nodes_on_ranks[r].insert(node_gid); + } + } + } + + // Use the map to create a vector of the ranks that this rank will send data to for ghost nodes + std::set ghost_node_send_ranks; + for (const auto& pair : local_node_gid_to_ghosting_ranks) { + ghost_node_send_ranks.insert(pair.second.begin(), pair.second.end()); + } + std::vector ghost_node_send_ranks_vec(ghost_node_send_ranks.begin(), ghost_node_send_ranks.end()); + + std::map> nodes_to_send_by_rank; // rank -> list of local node indices + for (int r = 0; r < world_size; r++) { + if (r == rank) continue; + for (size_t node_gid : ghosted_nodes_on_ranks[r]) { + int local_node_id = global_to_local_node_mapping[node_gid]; + nodes_to_send_by_rank[r].push_back(local_node_id); + } + } + + //print out the nodes to send by rank for each rank sequentially + for (int r = 0; r < world_size; r++) { + if (rank == r) { + std::cout << "Rank " << rank << " will send data to the following ranks for ghost nodes: "; + for (const auto& rank_node_pair : nodes_to_send_by_rank) { + std::cout << rank_node_pair.first << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + // Store the count of ghost elements for later use input_mesh.num_ghost_elems = ghost_elem_gids.size(); @@ -871,6 +988,7 @@ void build_ghost( } } + // Assign extended local IDs to ghost-only nodes + for (size_t node_gid : ghost_only_nodes) { + node_gid_to_extended_lid[node_gid] = 
extended_node_lid++; @@ -1055,7 +1173,8 @@ void build_ghost( // ****************************************************************************************** - output_node.initialize(total_extended_nodes, 3, {node_state::coords}); + output_node.initialize(total_extended_nodes, 3, {node_state::coords}, node_communication_plan); + MPI_Barrier(MPI_COMM_WORLD); // The goal here is to populate output_node.coords using globally gathered ghost node coordinates, // since input_node does not contain ghost node coordinates. @@ -1278,7 +1397,7 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); - +// Initialize graph comms for elements // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator // that efficiently represents the communication pattern between ranks. // This allows MPI to optimize communication based on the actual connectivity pattern. @@ -1290,11 +1409,11 @@ void build_ghost( std::vector ghost_elem_receive_ranks_vec(ghost_elem_receive_ranks.begin(), ghost_elem_receive_ranks.end()); // The number of ranks from which this rank will receive data (incoming neighbors) - int indegree = static_cast(ghost_elem_receive_ranks_vec.size()); + int elem_indegree = static_cast(ghost_elem_receive_ranks_vec.size()); // sources: Array of source rank IDs (ranks we receive from) // Each element corresponds to a rank that owns elements we ghost - int* sources = (indegree > 0) ? ghost_elem_receive_ranks_vec.data() : MPI_UNWEIGHTED; + int* sources = (elem_indegree > 0) ? ghost_elem_receive_ranks_vec.data() : MPI_UNWEIGHTED; // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) @@ -1311,12 +1430,41 @@ void build_ghost( int* destinations = (outdegree > 0) ? 
ghost_comm_ranks_vec.data() : MPI_UNWEIGHTED; // Initialize the graph communicator for element communication - element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), indegree, ghost_elem_receive_ranks_vec.data()); + element_communication_plan.initialize_graph_communicator(outdegree, ghost_comm_ranks_vec.data(), elem_indegree, ghost_elem_receive_ranks_vec.data()); MPI_Barrier(MPI_COMM_WORLD); // Optional: Verify the graph communicator was created successfully // if(print_info) element_communication_plan.verify_graph_communicator(); + +// Initialize graph comms for nodes + // ---------- Prepare INCOMING edges (sources) ---------- + // indegree: Number of ranks from which this rank will RECEIVE data + // These are the ranks that own nodes which are ghosted on this rank + int node_indegree = static_cast(ghost_node_receive_ranks.size()); + int* node_sources = (node_indegree > 0) ? ghost_node_receive_ranks_vec.data() : MPI_UNWEIGHTED; + + // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) + int* node_sourceweights = MPI_UNWEIGHTED; + + // ---------- Prepare OUTGOING edges (destinations) ---------- + // outdegree: Number of ranks to which this rank will SEND data + // These are the ranks that ghost nodes owned by this rank + int node_outdegree = static_cast(ghost_node_send_ranks.size()); + int* node_destinations = (node_outdegree > 0) ? 
ghost_node_send_ranks_vec.data() : MPI_UNWEIGHTED; + + // destinationweights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) + int* node_destinationweights = MPI_UNWEIGHTED; + + // Initialize the graph communicator for node communication + node_communication_plan.initialize_graph_communicator(node_outdegree, node_destinations, node_indegree, node_sources); + MPI_Barrier(MPI_COMM_WORLD); + + // Optional: Verify the graph communicator was created successfully + print_info = true; + if(print_info) node_communication_plan.verify_graph_communicator(); + print_info = false; + // ****************************************************************************************** // Build send counts and displacements for element communication // ****************************************************************************************** @@ -1399,10 +1547,112 @@ void build_ghost( // 2) Allgatherv ghost node GIDs to build gid -> [ranks that ghost it]. // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. 
// -------------------------------------------------------------------------------------- + + + // Print out the nodes to send by rank for each rank sequentially + for (int r = 0; r < world_size; r++) { + if (rank == r) { + std::cout << "Rank " << rank << " will send data to the following ranks for ghost nodes: "; + for (const auto& rank_node_pair : nodes_to_send_by_rank) { + std::cout << rank_node_pair.first << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + + // Serialize into a DRaggedRightArrayKokkos + CArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks); + for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { + int dest_rank = node_communication_plan.send_rank_ids.host(i); + node_send_strides_array(i) = nodes_to_send_by_rank[dest_rank].size(); + } + DRaggedRightArrayKokkos nodes_to_send_by_rank_rr(node_send_strides_array, "nodes_to_send_by_rank"); + + // Fill in the data + for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { + int dest_rank = node_communication_plan.send_rank_ids.host(i); + for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { + nodes_to_send_by_rank_rr.host(i, j) = nodes_to_send_by_rank[dest_rank][j]; + } + } + nodes_to_send_by_rank_rr.update_device(); + + + + // Count how many ghost nodes come from each source rank + std::map> nodes_to_recv_by_rank; // rank -> list of ghost node indices + int ghost_node_index = 0; + for (size_t ghost_node_gid : ghost_node_gids) { + int source_rank = ghost_node_recv_rank[ghost_node_gid]; + int ghost_node_local_id = output_mesh.num_owned_nodes + ghost_node_index; + nodes_to_recv_by_rank[source_rank].push_back(ghost_node_local_id); + ghost_node_index++; + } - + // Serialize into a DRaggedRightArrayKokkos + CArrayKokkos nodes_recv_strides_array(node_communication_plan.num_recv_ranks); + for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { + int source_rank = node_communication_plan.recv_rank_ids.host(i); 
+ nodes_recv_strides_array(i) = nodes_to_recv_by_rank[source_rank].size(); + } + DRaggedRightArrayKokkos nodes_to_recv_by_rank_rr(nodes_recv_strides_array, "nodes_to_recv_by_rank"); + // Fill in the data + for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { + int source_rank = node_communication_plan.recv_rank_ids.host(i); + for (int j = 0; j < nodes_to_recv_by_rank[source_rank].size(); j++) { + size_t local_id = nodes_to_recv_by_rank[source_rank][j]; + nodes_to_recv_by_rank_rr.host(i, j) = nodes_to_recv_by_rank[source_rank][j]; + } + } + nodes_to_recv_by_rank_rr.update_device(); + + MPI_Barrier(MPI_COMM_WORLD); + + // print the nodes to send by rank rr for each rank sequentially + for (int r = 0; r < world_size; r++) { + if (rank == r) { + std::cout << "Rank " << rank << " will send nodes to the following ranks (nodes_to_send_by_rank_rr):" << std::endl; + for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { + int dest_rank = node_communication_plan.send_rank_ids.host(i); + std::cout << " To rank " << dest_rank << ": ["; + for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { + int global_node_id = output_mesh.local_to_global_node_mapping.host(nodes_to_send_by_rank[dest_rank][j]); + std::cout << global_node_id << " "; + } + std::cout << "]" << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); + } + + + // print the nodes to send by rank rr for each rank sequentially + for (int r = 0; r < world_size; r++) { + if (rank == r) { + std::cout << "Rank " << rank << " will receive nodes from the following ranks (nodes_to_recv_by_rank_rr):" << std::endl; + for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { + int source_rank = node_communication_plan.recv_rank_ids.host(i); + std::cout << " From rank " << source_rank << ": ["; + for (int j = 0; j < nodes_to_recv_by_rank[source_rank].size(); j++) { + int global_node_id = output_mesh.local_to_global_node_mapping.host(nodes_to_recv_by_rank[source_rank][j]); + std::cout 
<< global_node_id << " "; + } + std::cout << "]" << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); + } + + + node_communication_plan.setup_send_recv(nodes_to_send_by_rank_rr, nodes_to_recv_by_rank_rr); + MPI_Barrier(MPI_COMM_WORLD); + + node_communication_plan.verify_send_recv(); + } @@ -2106,41 +2356,41 @@ void partition_mesh( std::vector node_states = {node_state::coords, node_state::scalar_field, node_state::vector_field}; final_node.initialize(final_mesh.num_nodes, 3, node_states, node_communication_plan); - for (int i = 0; i < final_mesh.num_owned_nodes; i++) { - final_node.scalar_field.host(i) = static_cast(rank); - final_node.vector_field.host(i, 0) = static_cast(rank); - final_node.vector_field.host(i, 1) = static_cast(rank); - final_node.vector_field.host(i, 2) = static_cast(rank); - } - for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { - final_node.scalar_field.host(i) = -1.0; - final_node.vector_field.host(i, 0) = -1.0; - final_node.vector_field.host(i, 1) = -1.0; - final_node.vector_field.host(i, 2) = -1.0; - } - - final_node.coords.update_device(); - final_node.scalar_field.update_device(); - final_node.vector_field.update_device(); - - final_node.scalar_field.communicate(); - final_node.vector_field.communicate(); - MPI_Barrier(MPI_COMM_WORLD); - - - // Update scalar field to visualize the communication - - for(int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { - double value = 0.0; - for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - value += final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)); - } - value /= final_mesh.num_nodes_in_elem; - - for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)) = value; - } - } + // for (int i = 0; i < final_mesh.num_owned_nodes; i++) { + // final_node.scalar_field.host(i) = static_cast(rank); + // final_node.vector_field.host(i, 0) = static_cast(rank); + // 
final_node.vector_field.host(i, 1) = static_cast(rank); + // final_node.vector_field.host(i, 2) = static_cast(rank); + // } + // for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { + // final_node.scalar_field.host(i) = -1.0; + // final_node.vector_field.host(i, 0) = -1.0; + // final_node.vector_field.host(i, 1) = -1.0; + // final_node.vector_field.host(i, 2) = -1.0; + // } + + // final_node.coords.update_device(); + // final_node.scalar_field.update_device(); + // final_node.vector_field.update_device(); + + // final_node.scalar_field.communicate(); + // // final_node.vector_field.communicate(); + // MPI_Barrier(MPI_COMM_WORLD); + + + // // Update scalar field to visualize the communication + + // for(int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { + // double value = 0.0; + // for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { + // value += final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)); + // } + // value /= final_mesh.num_nodes_in_elem; + + // for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { + // final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)) = value; + // } + // } diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 2113e9d6..542628bd 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -33,8 +33,8 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; - double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {100, 100, 100}; + double length[3] = {1.0, 0.5, 0.5}; + int num_elems_dim[3] = {2, 1, 1}; // Initial mesh built on rank zero Mesh_t initial_mesh; @@ -58,6 +58,9 @@ int main(int argc, char** argv) { std::cout<<"Initializing mesh"< Date: Wed, 19 Nov 2025 16:57:02 -0600 Subject: [PATCH 34/52] BUG: Debugging nodal send/recv WIP, working for 2x1 mesh --- examples/mesh_decomp/decomp_utils.h | 196 ++++++++++++++++++++++++---- 1 file changed, 170 
insertions(+), 26 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index ff697abe..3acf006d 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -792,9 +792,23 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); + // Print out the ghost elements for each rank sequentially + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "Rank " << rank << " has the following ghost elements: "; + for (const auto& elem_gid : ghost_elem_gids) { + std::cout << elem_gid << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + std::map> ghost_nodes_from_ranks; - std::set shared_nodes; // nodes on MPI rank boundaries + // Iterate through connectivity data from each rank (except ourselves) for (int r = 0; r < world_size; r++) { @@ -825,6 +839,63 @@ void build_ghost( } } + std::set shared_nodes; // nodes on MPI rank boundaries + // Iterate through connectivity data from each rank (except ourselves) to find shared nodes + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (r == rank) continue; // Skip our own data - we already know our elements + + // Parse the connectivity data for rank r + // Data format: [elem0_gid, node0, elem0_gid, node1, ..., elem1_gid, node0, ...] 
+ // Each pair is 2 size_ts, so num_pairs = conn_sizes[r] / 2 + int num_pairs = conn_sizes[r] / 2; + + for (int i = 0; i < num_pairs; i++) { + // Offset into all_conn for this pair (elem_gid, node_gid) + int offset = conn_displs[r] + i * 2; + size_t elem_gid = all_conn[offset]; + size_t node_gid = all_conn[offset + 1]; + + // Check if this element belongs to one of our ghost elements + if (ghost_elem_gids.find(elem_gid) != ghost_elem_gids.end()) { + // If another rank references a node that is also owned by us, it is a shared node + if (local_node_gids.find(node_gid) != local_node_gids.end()) { + shared_nodes.insert(node_gid); + + } + } + } + } + + + + MPI_Barrier(MPI_COMM_WORLD); + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "Rank " << rank << " has the following shared nodes: "; + for (const auto& node_gid : shared_nodes) { + std::cout << node_gid << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + + MPI_Barrier(MPI_COMM_WORLD); + // Print out the ghost nodes for each rank sequentially + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "Rank " << rank << " has the following ghost nodes: "; + for (const auto& node_gid : ghost_node_gids) { + std::cout << node_gid << " "; + } + std::cout << std::endl; + } + MPI_Barrier(MPI_COMM_WORLD); + } + // WARNING: HERE IS THE BUG: // When we create the send pattern for ghost nodes, we are not filtering out nodes that are on MPI rank boundaries @@ -857,7 +928,7 @@ void build_ghost( // Map: local_node_gid -> set of remote ranks that ghost this node std::map> local_node_gid_to_ghosting_ranks; - std::vector> ghosted_nodes_on_ranks(world_size); + std::vector> shared_nodes_on_ranks(world_size); // Iterate through connectivity from all ranks except ourselves for (int r = 0; r < world_size; r++) { @@ -872,11 +943,13 @@ void build_ghost( // If this node is owned by us, and remote rank references 
it, they are ghosting it if (local_node_gids.find(node_gid) != local_node_gids.end()) { local_node_gid_to_ghosting_ranks[node_gid].insert(r); - ghosted_nodes_on_ranks[r].insert(node_gid); + shared_nodes_on_ranks[r].insert(node_gid); } } } + // WARNING: THE PREVIOUS STEP MUST INCLUDE ALL NODES AFTER MOVING GHOST NODES ONTO THIS RANK, and must be filtered to not include shared ndoes + // Use the map to create a vector of the ranks that this rank will receive data from for ghost nodes std::set ghost_node_send_ranks; for (const auto& pair : local_node_gid_to_ghosting_ranks) { @@ -884,27 +957,14 @@ void build_ghost( } std::vector ghost_node_send_ranks_vec(ghost_node_send_ranks.begin(), ghost_node_send_ranks.end()); - std::map> nodes_to_send_by_rank; // rank -> list of local node indices - for (int r = 0; r < world_size; r++) { - if (r == rank) continue; - for (size_t node_gid : ghosted_nodes_on_ranks[r]) { - int local_node_id = global_to_local_node_mapping[node_gid]; - nodes_to_send_by_rank[r].push_back(local_node_id); - } - } - - //print out the nodes to send by rank for each rank sequentially - for (int r = 0; r < world_size; r++) { - if (rank == r) { - std::cout << "Rank " << rank << " will send data to the following ranks for ghost nodes: "; - for (const auto& rank_node_pair : nodes_to_send_by_rank) { - std::cout << rank_node_pair.first << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } - + // std::map> nodes_to_send_by_rank; // rank -> list of local node indices + // for (int r = 0; r < world_size; r++) { + // if (r == rank) continue; + // for (size_t node_gid : shared_nodes_on_ranks[r]) { + // int local_node_id = global_to_local_node_mapping[node_gid]; + // nodes_to_send_by_rank[r].push_back(local_node_id); + // } + // } // Store the count of ghost elements for later use input_mesh.num_ghost_elems = ghost_elem_gids.size(); @@ -996,6 +1056,8 @@ void build_ghost( int total_extended_nodes = extended_node_lid; + MPI_Barrier(MPI_COMM_WORLD); + 
// Step 3: Prepare requests for ghost node coordinates from owning ranks (if needed later) // Build request list: for each ghost node, find an owning rank via any ghost element that contains it std::map> rank_to_ghost_node_requests; @@ -1163,6 +1225,15 @@ void build_ghost( output_mesh.nodes_in_elem.update_device(); output_mesh.build_connectivity(); + + + + + + + + + MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout << " Finished building final mesh structure with ghost nodes and elements" << std::endl; @@ -1397,6 +1468,77 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); + + std::map> node_set_to_send_by_rank; + + // For each owned element that will be ghosted on other ranks, + // collect the nodes that need to be sent to those ranks + // boundary_elem_targets[elem_lid] contains pairs (rank, elem_gid) for ranks that ghost this element + for (int elem_lid = 0; elem_lid < input_mesh.num_elems; elem_lid++) { + // Get ranks that will ghost this element + for (const auto& pair : boundary_elem_targets[elem_lid]) { + int ghosting_rank = pair.first; + + // For each node in this element + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = input_mesh.nodes_in_elem.host(elem_lid, j); + size_t node_gid = input_mesh.local_to_global_node_mapping.host(node_lid); + + // Only send nodes that are NOT shared (not on MPI rank boundary) + // Shared nodes are already known to both ranks + if (shared_nodes.find(node_gid) == shared_nodes.end()) { + node_set_to_send_by_rank[ghosting_rank].insert(node_gid); + } + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); + + // Print out node_set_to_send_by_rank for each rank sequentially + MPI_Barrier(MPI_COMM_WORLD); + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "Rank " << r << " node_set_to_send_by_rank:" << std::endl; + for (const auto& [dest_rank, node_gids] : node_set_to_send_by_rank) { + std::cout << " To rank " << dest_rank << ": ["; + for (size_t node_gid : node_gids) { + 
std::cout << node_gid << " "; + } + std::cout << "]" << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); + } + + std::map> nodes_to_send_by_rank; // rank -> list of global node indices + + // Copy the node_set_to_send_by_rank map to nodes_to_send_by_rank + for (const auto& [dest_rank, node_gids] : node_set_to_send_by_rank) { + for (size_t node_gid : node_gids) { + nodes_to_send_by_rank[dest_rank].push_back(node_gid); + } + } + + // Print out nodes_to_send_by_rank for each rank sequentially + MPI_Barrier(MPI_COMM_WORLD); + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "Rank " << r << " nodes_to_send_by_rank:" << std::endl; + for (const auto& [dest_rank, node_gids] : nodes_to_send_by_rank) { + std::cout << " To rank " << dest_rank << ": ["; + for (size_t node_gid : node_gids) { + std::cout << node_gid << " "; + } + std::cout << "]" << std::endl; + } + } + MPI_Barrier(MPI_COMM_WORLD); + } + + // Initialize graph comms for elements // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator // that efficiently represents the communication pattern between ranks. 
@@ -1612,13 +1754,14 @@ void build_ghost( // print the nodes to send by rank rr for each rank sequentially for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); if (rank == r) { std::cout << "Rank " << rank << " will send nodes to the following ranks (nodes_to_send_by_rank_rr):" << std::endl; for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { int dest_rank = node_communication_plan.send_rank_ids.host(i); std::cout << " To rank " << dest_rank << ": ["; for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { - int global_node_id = output_mesh.local_to_global_node_mapping.host(nodes_to_send_by_rank[dest_rank][j]); + int global_node_id = nodes_to_send_by_rank[dest_rank][j]; std::cout << global_node_id << " "; } std::cout << "]" << std::endl; @@ -1627,7 +1770,8 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); } - + MPI_Barrier(MPI_COMM_WORLD); + // print the nodes to send by rank rr for each rank sequentially for (int r = 0; r < world_size; r++) { if (rank == r) { From f7e350db303c84eea536cdb5f705fb0f8b600e79 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Wed, 19 Nov 2025 17:17:07 -0600 Subject: [PATCH 35/52] ENH: Node send ids are working, now to fix recv --- examples/mesh_decomp/decomp_utils.h | 7 ++++--- examples/mesh_decomp/mesh_decomp.cpp | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 3acf006d..1528bf60 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -1486,7 +1486,7 @@ void build_ghost( // Only send nodes that are NOT shared (not on MPI rank boundary) // Shared nodes are already known to both ranks - if (shared_nodes.find(node_gid) == shared_nodes.end()) { + if (shared_nodes_on_ranks[ghosting_rank].find(node_gid) == shared_nodes_on_ranks[ghosting_rank].end()) { // WARNING: THIS SHOULD BE MOFIFIED TO ONLY FILTER SHARED NODES WITH THIS SPECIFIC RANK 
node_set_to_send_by_rank[ghosting_rank].insert(node_gid); } } @@ -1716,7 +1716,8 @@ void build_ghost( for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { int dest_rank = node_communication_plan.send_rank_ids.host(i); for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { - nodes_to_send_by_rank_rr.host(i, j) = nodes_to_send_by_rank[dest_rank][j]; + int node_gid = output_mesh.local_to_global_node_mapping.host(nodes_to_send_by_rank[dest_rank][j]); + nodes_to_send_by_rank_rr.host(i, j) = node_gid; } } nodes_to_send_by_rank_rr.update_device(); @@ -1761,7 +1762,7 @@ void build_ghost( int dest_rank = node_communication_plan.send_rank_ids.host(i); std::cout << " To rank " << dest_rank << ": ["; for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { - int global_node_id = nodes_to_send_by_rank[dest_rank][j]; + int global_node_id = output_mesh.local_to_global_node_mapping.host(nodes_to_send_by_rank[dest_rank][j]); std::cout << global_node_id << " "; } std::cout << "]" << std::endl; diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 542628bd..b2f9f691 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -33,8 +33,8 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; - double length[3] = {1.0, 0.5, 0.5}; - int num_elems_dim[3] = {2, 1, 1}; + double length[3] = {1.0, 1.0, 0.5}; + int num_elems_dim[3] = {2, 2, 1}; // Initial mesh built on rank zero Mesh_t initial_mesh; From 72e0f5b18306a2e8d8e3b61c0c5398d531ebce72 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Wed, 19 Nov 2025 17:36:16 -0600 Subject: [PATCH 36/52] ENH: send/recv match, need to be swapped to local ids --- examples/mesh_decomp/decomp_utils.h | 61 ++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 10 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 1528bf60..717b31fe 100644 --- 
a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -1724,14 +1724,54 @@ void build_ghost( - // Count how many ghost nodes come from each source rank - std::map> nodes_to_recv_by_rank; // rank -> list of ghost node indices - int ghost_node_index = 0; - for (size_t ghost_node_gid : ghost_node_gids) { - int source_rank = ghost_node_recv_rank[ghost_node_gid]; - int ghost_node_local_id = output_mesh.num_owned_nodes + ghost_node_index; - nodes_to_recv_by_rank[source_rank].push_back(ghost_node_local_id); - ghost_node_index++; + // For each ghost element, determine which nodes need to be received from the owning rank + // Build the receive list based on ghost element nodes, not on ghost_node_gids + // This ensures we receive all nodes needed by ghost elements + std::map> node_set_to_recv_by_rank; // rank -> set of node GIDs to receive + + for (int i = 0; i < output_mesh.num_ghost_elems; i++) { + int ghost_elem_lid = output_mesh.num_owned_elems + i; + size_t ghost_elem_gid = output_mesh.local_to_global_elem_mapping.host(ghost_elem_lid); + int owning_rank = elem_gid_to_rank.at(ghost_elem_gid); + + // Collect all nodes in this ghost element + for (int j = 0; j < nodes_per_elem; j++) { + size_t node_lid = output_mesh.nodes_in_elem.host(ghost_elem_lid, j); + size_t node_gid = output_mesh.local_to_global_node_mapping.host(node_lid); + + // Only receive nodes that: + // 1. We don't own (not in local_node_gids) + // 2. 
Are NOT shared (not on MPI rank boundary) + // Shared nodes are already known to both ranks via element connectivity + if (local_node_gids.find(node_gid) == local_node_gids.end() && + shared_nodes_on_ranks[owning_rank].find(node_gid) == shared_nodes_on_ranks[owning_rank].end()) { + node_set_to_recv_by_rank[owning_rank].insert(node_gid); + } + } + } + + // Convert node GIDs to local indices and build nodes_to_recv_by_rank + std::map> nodes_to_recv_by_rank; // rank -> list of ghost node local indices + std::map node_gid_to_ghost_lid; // map ghost node GID to its local index in output_mesh + + // Build the GID->local index mapping for ALL ghost nodes in output_mesh + // Ghost nodes are those with local IDs >= num_owned_nodes + for (int i = output_mesh.num_owned_nodes; i < output_mesh.num_nodes; i++) { + size_t node_gid = output_mesh.local_to_global_node_mapping.host(i); + node_gid_to_ghost_lid[node_gid] = i; + } + + // Now convert the GID sets to local index vectors + for (const auto& pair : node_set_to_recv_by_rank) { + int source_rank = pair.first; + const std::set& node_gids = pair.second; + + for (size_t node_gid : node_gids) { + auto it = node_gid_to_ghost_lid.find(node_gid); + if (it != node_gid_to_ghost_lid.end()) { + nodes_to_recv_by_rank[source_rank].push_back(it->second); + } + } } // Serialize into a DRaggedRightArrayKokkos @@ -1762,7 +1802,7 @@ void build_ghost( int dest_rank = node_communication_plan.send_rank_ids.host(i); std::cout << " To rank " << dest_rank << ": ["; for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { - int global_node_id = output_mesh.local_to_global_node_mapping.host(nodes_to_send_by_rank[dest_rank][j]); + int global_node_id = nodes_to_send_by_rank[dest_rank][j]; std::cout << global_node_id << " "; } std::cout << "]" << std::endl; @@ -1781,7 +1821,8 @@ void build_ghost( int source_rank = node_communication_plan.recv_rank_ids.host(i); std::cout << " From rank " << source_rank << ": ["; for (int j = 0; j < 
nodes_to_recv_by_rank[source_rank].size(); j++) { - int global_node_id = output_mesh.local_to_global_node_mapping.host(nodes_to_recv_by_rank[source_rank][j]); + int node_lid = nodes_to_recv_by_rank[source_rank][j]; + size_t global_node_id = output_mesh.local_to_global_node_mapping.host(node_lid); std::cout << global_node_id << " "; } std::cout << "]" << std::endl; From 6332734a9650aee45ddd9503cd09782b8e9880da Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Thu, 20 Nov 2025 13:35:52 -0600 Subject: [PATCH 37/52] ENH: Nodal comms working --- examples/mesh_decomp/decomp_utils.h | 459 ++++++++++++++++----------- examples/mesh_decomp/mesh_decomp.cpp | 4 +- examples/mesh_decomp/mesh_io.h | 14 +- src/include/mpi_types.h | 4 + 4 files changed, 281 insertions(+), 200 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 717b31fe..0e7cedff 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -105,6 +105,10 @@ void naive_partition_mesh( // MPI_Scatter(const void *sendbuf, int sendcount, MPI_Datatype sendtype, // void *recvbuf, int recvcount, MPI_Datatype recvtype, // int root, MPI_Comm comm) + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Starting the scatter operation for the element counts per rank"< required_node_state = { node_state::coords }; + + naive_node.initialize(num_nodes_on_rank, 3, required_node_state); for(int i = 0; i < num_nodes_on_rank; i++) { @@ -343,6 +360,10 @@ void naive_partition_mesh( naive_node.coords.update_device(); + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" After initializing the node state variables"< nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); double t_scatter_elemnode_start = MPI_Wtime(); - - if (rank == 0) { - // Prepare element-node connectivity data for each rank - std::vector all_nodes_in_elem; - std::vector sendcounts(world_size); - std::vector displs(world_size); + MPI_Barrier(MPI_COMM_WORLD); + 
// if (rank == 0) { + // // Prepare element-node connectivity data for each rank + // std::vector all_nodes_in_elem; + // std::vector sendcounts(world_size); + // std::vector displs(world_size); - int displacement = 0; - for(int i = 0; i < world_size; i++) { - int num_connectivity_entries = elements_to_send[i].size() * num_nodes_per_elem; // num_nodes_per_elem nodes per element - sendcounts[i] = num_connectivity_entries; - displs[i] = displacement; + // int displacement = 0; + // for(int i = 0; i < world_size; i++) { + // int num_connectivity_entries = elements_to_send[i].size() * num_nodes_per_elem; // num_nodes_per_elem nodes per element + // sendcounts[i] = num_connectivity_entries; + // displs[i] = displacement; - // Copy element-node connectivity for rank i - for(int j = 0; j < elements_to_send[i].size(); j++) { - for(int k = 0; k < num_nodes_per_elem; k++) { - all_nodes_in_elem.push_back(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); - } - } - displacement += num_connectivity_entries; - } - // Send the connectivity data to each rank - MPI_Scatterv(all_nodes_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, - nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, - 0, MPI_COMM_WORLD); + // // Copy element-node connectivity for rank i + // for(int j = 0; j < elements_to_send[i].size(); j++) { + // for(int k = 0; k < num_nodes_per_elem; k++) { + // all_nodes_in_elem.push_back(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); + // } + // } + // displacement += num_connectivity_entries; + // } + // // Send the connectivity data to each rank + // MPI_Scatterv(all_nodes_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, + // nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, + // 0, MPI_COMM_WORLD); + // } + // else { + // MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, + // nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, + // 0, 
MPI_COMM_WORLD); + // } + + // MPI_Barrier(MPI_COMM_WORLD); + + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" before scattering the element-node connectivity data to each rank"< conn_sendcounts(world_size); + std::vector conn_displs(world_size); + int conn_displacement = 0; + for (int i = 0; i < world_size; i++) { + conn_sendcounts[i] = elems_per_rank[i] * num_nodes_per_elem; + conn_displs[i] = conn_displacement; + conn_displacement += conn_sendcounts[i]; } - else { - MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, - 0, MPI_COMM_WORLD); + + // Scatter using the native storage type (size_t) and then convert locally to int + size_t* global_nodes_in_elem = (rank == 0) + ? initial_mesh.nodes_in_elem.host_pointer() + : nullptr; + + std::vector nodes_in_elem_on_rank_size_t(num_elements_on_rank * num_nodes_per_elem); + + MPI_Scatterv(global_nodes_in_elem, conn_sendcounts.data(), conn_displs.data(), MPI_UNSIGNED_LONG_LONG, + nodes_in_elem_on_rank_size_t.data(), nodes_in_elem_on_rank_size_t.size(), MPI_UNSIGNED_LONG_LONG, + 0, MPI_COMM_WORLD); + + for (size_t idx = 0; idx < nodes_in_elem_on_rank_size_t.size(); ++idx) { + nodes_in_elem_on_rank[idx] = static_cast(nodes_in_elem_on_rank_size_t[idx]); } MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" After scattering the element-node connectivity data to each rank"< all_elems_in_elem; @@ -479,6 +537,9 @@ void naive_partition_mesh( MPI_Barrier(MPI_COMM_WORLD); + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" After scattering the element-element connectivity data to each rank"<> ghost_nodes_from_ranks; @@ -869,32 +930,32 @@ void build_ghost( - MPI_Barrier(MPI_COMM_WORLD); - for (int r = 0; r < world_size; r++) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "Rank " << rank << " has the following shared nodes: "; - for (const auto& node_gid : shared_nodes) { - 
std::cout << node_gid << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } + // MPI_Barrier(MPI_COMM_WORLD); + // for (int r = 0; r < world_size; r++) { + // MPI_Barrier(MPI_COMM_WORLD); + // if (rank == r) { + // std::cout << "Rank " << rank << " has the following shared nodes: "; + // for (const auto& node_gid : shared_nodes) { + // std::cout << node_gid << " "; + // } + // std::cout << std::endl; + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } - MPI_Barrier(MPI_COMM_WORLD); - // Print out the ghost nodes for each rank sequentially - for (int r = 0; r < world_size; r++) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "Rank " << rank << " has the following ghost nodes: "; - for (const auto& node_gid : ghost_node_gids) { - std::cout << node_gid << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } + // MPI_Barrier(MPI_COMM_WORLD); + // // Print out the ghost nodes for each rank sequentially + // for (int r = 0; r < world_size; r++) { + // MPI_Barrier(MPI_COMM_WORLD); + // if (rank == r) { + // std::cout << "Rank " << rank << " has the following ghost nodes: "; + // for (const auto& node_gid : ghost_node_gids) { + // std::cout << node_gid << " "; + // } + // std::cout << std::endl; + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } // WARNING: HERE IS THE BUG: // When we create the send pattern for ghost nodes, we are not filtering out nodes that are on MPI rank boundaries @@ -909,17 +970,17 @@ void build_ghost( // Print out the ghost node receive ranks for each rank sequentially - for (int r = 0; r < world_size; r++) { - if (rank == r) { - MPI_Barrier(MPI_COMM_WORLD); - std::cout << "Rank " << rank << " will receive data from the following ranks for ghost nodes: "; - for (int r : ghost_node_receive_ranks_vec) { - std::cout << r << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } + // for (int r = 0; r < world_size; r++) { + // if (rank == r) { + // MPI_Barrier(MPI_COMM_WORLD); + 
// std::cout << "Rank " << rank << " will receive data from the following ranks for ghost nodes: "; + // for (int r : ghost_node_receive_ranks_vec) { + // std::cout << r << " "; + // } + // std::cout << std::endl; + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } // Find which nodes *we own* are ghosted on other ranks, and on which ranks @@ -1496,21 +1557,21 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); // Print out node_set_to_send_by_rank for each rank sequentially - MPI_Barrier(MPI_COMM_WORLD); - for (int r = 0; r < world_size; r++) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "Rank " << r << " node_set_to_send_by_rank:" << std::endl; - for (const auto& [dest_rank, node_gids] : node_set_to_send_by_rank) { - std::cout << " To rank " << dest_rank << ": ["; - for (size_t node_gid : node_gids) { - std::cout << node_gid << " "; - } - std::cout << "]" << std::endl; - } - } - MPI_Barrier(MPI_COMM_WORLD); - } + // MPI_Barrier(MPI_COMM_WORLD); + // for (int r = 0; r < world_size; r++) { + // MPI_Barrier(MPI_COMM_WORLD); + // if (rank == r) { + // std::cout << "Rank " << r << " node_set_to_send_by_rank:" << std::endl; + // for (const auto& [dest_rank, node_gids] : node_set_to_send_by_rank) { + // std::cout << " To rank " << dest_rank << ": ["; + // for (size_t node_gid : node_gids) { + // std::cout << node_gid << " "; + // } + // std::cout << "]" << std::endl; + // } + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } std::map> nodes_to_send_by_rank; // rank -> list of global node indices @@ -1521,25 +1582,25 @@ void build_ghost( } } - // Print out nodes_to_send_by_rank for each rank sequentially - MPI_Barrier(MPI_COMM_WORLD); - for (int r = 0; r < world_size; r++) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "Rank " << r << " nodes_to_send_by_rank:" << std::endl; - for (const auto& [dest_rank, node_gids] : nodes_to_send_by_rank) { - std::cout << " To rank " << dest_rank << ": ["; - for (size_t node_gid : node_gids) { - std::cout 
<< node_gid << " "; - } - std::cout << "]" << std::endl; - } - } - MPI_Barrier(MPI_COMM_WORLD); - } + // // Print out nodes_to_send_by_rank for each rank sequentially + // MPI_Barrier(MPI_COMM_WORLD); + // for (int r = 0; r < world_size; r++) { + // MPI_Barrier(MPI_COMM_WORLD); + // if (rank == r) { + // std::cout << "Rank " << r << " nodes_to_send_by_rank:" << std::endl; + // for (const auto& [dest_rank, node_gids] : nodes_to_send_by_rank) { + // std::cout << " To rank " << dest_rank << ": ["; + // for (size_t node_gid : node_gids) { + // std::cout << node_gid << " "; + // } + // std::cout << "]" << std::endl; + // } + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } -// Initialize graph comms for elements + // Initialize graph comms for elements // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator // that efficiently represents the communication pattern between ranks. // This allows MPI to optimize communication based on the actual connectivity pattern. @@ -1579,7 +1640,7 @@ void build_ghost( // if(print_info) element_communication_plan.verify_graph_communicator(); -// Initialize graph comms for nodes + // Initialize graph comms for nodes // ---------- Prepare INCOMING edges (sources) ---------- // indegree: Number of ranks from which this rank will RECEIVE data // These are the ranks that own nodes which are ghosted on this rank @@ -1603,15 +1664,15 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); // Optional: Verify the graph communicator was created successfully - print_info = true; - if(print_info) node_communication_plan.verify_graph_communicator(); - print_info = false; + // print_info = true; + // if(print_info) node_communication_plan.verify_graph_communicator(); + // print_info = false; -// ****************************************************************************************** -// Build send counts and displacements for element communication -// 
****************************************************************************************** + // ****************************************************************************************** + // Build send counts and displacements for element communication + // ****************************************************************************************** - // ========== Build send counts and displacements for OUTGOING neighbors (destinations) ========== + // ========== Build send counts and displacements for OUTGOING neighbors (destinations) ========== // For MPI_Neighbor_alltoallv with graph communicator: // - elem_sendcounts[i] = number of elements to send to i-th outgoing neighbor (destinations_out[i]) // - elem_sdispls[i] = starting position in send buffer for i-th outgoing neighbor @@ -1692,16 +1753,16 @@ void build_ghost( // Print out the nodes to send by rank for each rank sequentially - for (int r = 0; r < world_size; r++) { - if (rank == r) { - std::cout << "Rank " << rank << " will send data to the following ranks for ghost nodes: "; - for (const auto& rank_node_pair : nodes_to_send_by_rank) { - std::cout << rank_node_pair.first << " "; - } - std::cout << std::endl; - } - MPI_Barrier(MPI_COMM_WORLD); - } + // for (int r = 0; r < world_size; r++) { + // if (rank == r) { + // std::cout << "Rank " << rank << " will send data to the following ranks for ghost nodes: "; + // for (const auto& rank_node_pair : nodes_to_send_by_rank) { + // std::cout << rank_node_pair.first << " "; + // } + // std::cout << std::endl; + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } // Serialize into a DRaggedRightArrayKokkos @@ -1716,8 +1777,9 @@ void build_ghost( for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { int dest_rank = node_communication_plan.send_rank_ids.host(i); for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { - int node_gid = output_mesh.local_to_global_node_mapping.host(nodes_to_send_by_rank[dest_rank][j]); - 
nodes_to_send_by_rank_rr.host(i, j) = node_gid; + int node_gid = nodes_to_send_by_rank[dest_rank][j]; + int node_lid = node_gid_to_extended_lid[node_gid]; + nodes_to_send_by_rank_rr.host(i, j) = node_lid; } } nodes_to_send_by_rank_rr.update_device(); @@ -1785,7 +1847,9 @@ void build_ghost( for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { int source_rank = node_communication_plan.recv_rank_ids.host(i); for (int j = 0; j < nodes_to_recv_by_rank[source_rank].size(); j++) { - size_t local_id = nodes_to_recv_by_rank[source_rank][j]; + size_t node_gid = nodes_to_recv_by_rank[source_rank][j]; + size_t local_id = node_gid_to_extended_lid[node_gid]; + nodes_to_recv_by_rank_rr.host(i, j) = nodes_to_recv_by_rank[source_rank][j]; } } @@ -1793,43 +1857,43 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); - // print the nodes to send by rank rr for each rank sequentially - for (int r = 0; r < world_size; r++) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "Rank " << rank << " will send nodes to the following ranks (nodes_to_send_by_rank_rr):" << std::endl; - for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { - int dest_rank = node_communication_plan.send_rank_ids.host(i); - std::cout << " To rank " << dest_rank << ": ["; - for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { - int global_node_id = nodes_to_send_by_rank[dest_rank][j]; - std::cout << global_node_id << " "; - } - std::cout << "]" << std::endl; - } - } - MPI_Barrier(MPI_COMM_WORLD); - } + // // print the nodes to send by rank rr for each rank sequentially + // for (int r = 0; r < world_size; r++) { + // MPI_Barrier(MPI_COMM_WORLD); + // if (rank == r) { + // std::cout << "Rank " << rank << " will send nodes to the following ranks (nodes_to_send_by_rank_rr):" << std::endl; + // for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { + // int dest_rank = node_communication_plan.send_rank_ids.host(i); + // std::cout << " To rank " << 
dest_rank << ": ["; + // for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { + // int global_node_id = nodes_to_send_by_rank[dest_rank][j]; + // std::cout << global_node_id << " "; + // } + // std::cout << "]" << std::endl; + // } + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } - MPI_Barrier(MPI_COMM_WORLD); + // MPI_Barrier(MPI_COMM_WORLD); - // print the nodes to send by rank rr for each rank sequentially - for (int r = 0; r < world_size; r++) { - if (rank == r) { - std::cout << "Rank " << rank << " will receive nodes from the following ranks (nodes_to_recv_by_rank_rr):" << std::endl; - for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { - int source_rank = node_communication_plan.recv_rank_ids.host(i); - std::cout << " From rank " << source_rank << ": ["; - for (int j = 0; j < nodes_to_recv_by_rank[source_rank].size(); j++) { - int node_lid = nodes_to_recv_by_rank[source_rank][j]; - size_t global_node_id = output_mesh.local_to_global_node_mapping.host(node_lid); - std::cout << global_node_id << " "; - } - std::cout << "]" << std::endl; - } - } - MPI_Barrier(MPI_COMM_WORLD); - } + // // print the nodes to send by rank rr for each rank sequentially + // for (int r = 0; r < world_size; r++) { + // if (rank == r) { + // std::cout << "Rank " << rank << " will receive nodes from the following ranks (nodes_to_recv_by_rank_rr):" << std::endl; + // for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { + // int source_rank = node_communication_plan.recv_rank_ids.host(i); + // std::cout << " From rank " << source_rank << ": ["; + // for (int j = 0; j < nodes_to_recv_by_rank[source_rank].size(); j++) { + // int node_lid = nodes_to_recv_by_rank[source_rank][j]; + // size_t global_node_id = output_mesh.local_to_global_node_mapping.host(node_lid); + // std::cout << global_node_id << " "; + // } + // std::cout << "]" << std::endl; + // } + // } + // MPI_Barrier(MPI_COMM_WORLD); + // } @@ -1837,7 +1901,7 @@ void build_ghost( 
node_communication_plan.setup_send_recv(nodes_to_send_by_rank_rr, nodes_to_recv_by_rank_rr); MPI_Barrier(MPI_COMM_WORLD); - node_communication_plan.verify_send_recv(); + // node_communication_plan.verify_send_recv(); } @@ -1897,8 +1961,12 @@ void partition_mesh( // Perform the naive partitioning of the mesh - naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout << "Performing the naive partitioning of the mesh" << std::endl; + naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, num_elems_in_elem_per_rank, world_size, rank); + MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout << "Naive partitioning of the mesh completed" << std::endl; /********************************************************************************** * Build PT-Scotch distributed graph representation of the mesh for repartitioning * @@ -2481,9 +2549,12 @@ void partition_mesh( CommunicationPlan node_communication_plan; node_communication_plan.initialize(MPI_COMM_WORLD); - build_ghost(intermediate_mesh, final_mesh, intermediate_node, final_node, element_communication_plan, node_communication_plan, world_size, rank); MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<" Starting the ghost element and node construction"< node_states = {node_state::coords, node_state::scalar_field, node_state::vector_field}; final_node.initialize(final_mesh.num_nodes, 3, node_states, node_communication_plan); - // for (int i = 0; i < final_mesh.num_owned_nodes; i++) { - // final_node.scalar_field.host(i) = static_cast(rank); - // final_node.vector_field.host(i, 0) = static_cast(rank); - // final_node.vector_field.host(i, 1) = static_cast(rank); - // final_node.vector_field.host(i, 2) = static_cast(rank); - // } - // for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { - // 
final_node.scalar_field.host(i) = -1.0; - // final_node.vector_field.host(i, 0) = -1.0; - // final_node.vector_field.host(i, 1) = -1.0; - // final_node.vector_field.host(i, 2) = -1.0; - // } + for (int i = 0; i < final_mesh.num_owned_nodes; i++) { + final_node.scalar_field.host(i) = static_cast(rank); + final_node.vector_field.host(i, 0) = static_cast(rank); + final_node.vector_field.host(i, 1) = static_cast(rank); + final_node.vector_field.host(i, 2) = static_cast(rank); + } + for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { + final_node.scalar_field.host(i) = -1.0; + final_node.vector_field.host(i, 0) = -1.0; + final_node.vector_field.host(i, 1) = -1.0; + final_node.vector_field.host(i, 2) = -1.0; + } - // final_node.coords.update_device(); - // final_node.scalar_field.update_device(); - // final_node.vector_field.update_device(); + final_node.coords.update_device(); + final_node.scalar_field.update_device(); + final_node.vector_field.update_device(); - // final_node.scalar_field.communicate(); - // // final_node.vector_field.communicate(); - // MPI_Barrier(MPI_COMM_WORLD); + final_node.scalar_field.communicate(); + // final_node.vector_field.communicate(); + MPI_Barrier(MPI_COMM_WORLD); - // // Update scalar field to visualize the communication + // Update scalar field to visualize the communication - // for(int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { - // double value = 0.0; - // for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - // value += final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)); - // } - // value /= final_mesh.num_nodes_in_elem; + for(int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { + double value = 0.0; + for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { + value += final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)); + } + value /= final_mesh.num_nodes_in_elem; - // for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - // 
final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)) = value; - // } - // } + for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { + final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)) = value; + } + } diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index b2f9f691..5738b21f 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -33,8 +33,8 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; - double length[3] = {1.0, 1.0, 0.5}; - int num_elems_dim[3] = {2, 2, 1}; + double length[3] = {1.0, 1.0, 1.0}; + int num_elems_dim[3] = {100, 100, 100}; // Initial mesh built on rank zero Mesh_t initial_mesh; diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index c9a75a0f..00f79fb2 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -287,8 +287,8 @@ void build_3d_box( const int num_cell_vec_vars = 0; const int num_cell_tensor_vars = 0; - const int num_point_scalar_vars = 2; - const int num_point_vec_vars = 1; + const int num_point_scalar_vars = 3; + const int num_point_vec_vars = 2; // Scalar values associated with a cell @@ -301,11 +301,11 @@ void build_3d_box( // }; const char point_scalar_var_names[num_point_scalar_vars][15] = { - "rank_id", "elems_in_node" + "rank_id", "elems_in_node", "scalar_field" }; const char point_vec_var_names[num_point_vec_vars][15] = { - "pos" + "pos", "vector_field" }; // short hand @@ -341,8 +341,14 @@ void build_3d_box( vec_fields(node_gid, 0, 1) = node.coords.host(node_gid, 1); vec_fields(node_gid, 0, 2) = node.coords.host(node_gid, 2); + // vector field, var 1 + vec_fields(node_gid, 1, 0) = node.vector_field.host(node_gid, 0); + vec_fields(node_gid, 1, 1) = node.vector_field.host(node_gid, 1); + vec_fields(node_gid, 1, 2) = node.vector_field.host(node_gid, 2); + point_scalar_fields(node_gid, 0) = rank; point_scalar_fields(node_gid, 1) = 
(double)mesh.num_corners_in_node(node_gid); + point_scalar_fields(node_gid, 2) = node.scalar_field.host(node_gid); if(node_gid == 0) { std::cout << "*******[rank " << rank << "] - num_corners_in_node: " << mesh.num_corners_in_node(node_gid) << std::endl; diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index 10e58121..b0999049 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -175,6 +175,10 @@ class MPICArrayKokkos { // Method to set comm plan for halo communication void initialize_comm_plan(CommunicationPlan& comm_plan){ comm_plan_ = &comm_plan; + + if(comm_plan_->comm_type == communication_plan_type::no_communication){ + return; + } size_t send_size = comm_plan_->total_send_count * stride_; size_t recv_size = comm_plan_->total_recv_count * stride_; From 524e95005659021329493953baf0ce23a458afe4 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 21 Nov 2025 11:09:59 -0600 Subject: [PATCH 38/52] STYLE: Tidying up --- examples/mesh_decomp/decomp_utils.h | 298 +--------------------------- 1 file changed, 11 insertions(+), 287 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 0e7cedff..ac1fd6e9 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -240,9 +240,6 @@ void naive_partition_mesh( // ****************************************************************************************** // Scatter the actual node global ids to each rank // ****************************************************************************************** - // Timer: Start measuring time for scattering node global ids - double t_scatter_nodeids_start = MPI_Wtime(); - if (rank == 0) { // Prepare data for MPI_Scatterv (scatter with variable counts) @@ -283,18 +280,12 @@ void naive_partition_mesh( MPI_Barrier(MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" After scattering the node global ids to each rank"< 
node_pos_on_rank_flat(num_nodes_on_rank * 3); - // Timer for scattering node positions - double t_scatter_nodepos_start = MPI_Wtime(); - if(rank == 0) { for (int i = 0; i < world_size; i++) { @@ -338,9 +329,6 @@ void naive_partition_mesh( MPI_Barrier(MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" After scattering the node positions to each rank"< required_node_state = { node_state::coords }; - naive_node.initialize(num_nodes_on_rank, 3, required_node_state); for(int i = 0; i < num_nodes_on_rank; i++) { @@ -359,55 +346,16 @@ void naive_partition_mesh( naive_node.coords.update_device(); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" After initializing the node state variables"< nodes_in_elem_on_rank(num_elements_on_rank * num_nodes_per_elem); - - double t_scatter_elemnode_start = MPI_Wtime(); - MPI_Barrier(MPI_COMM_WORLD); - // if (rank == 0) { - // // Prepare element-node connectivity data for each rank - // std::vector all_nodes_in_elem; - // std::vector sendcounts(world_size); - // std::vector displs(world_size); - - // int displacement = 0; - // for(int i = 0; i < world_size; i++) { - // int num_connectivity_entries = elements_to_send[i].size() * num_nodes_per_elem; // num_nodes_per_elem nodes per element - // sendcounts[i] = num_connectivity_entries; - // displs[i] = displacement; - - // // Copy element-node connectivity for rank i - // for(int j = 0; j < elements_to_send[i].size(); j++) { - // for(int k = 0; k < num_nodes_per_elem; k++) { - // all_nodes_in_elem.push_back(initial_mesh.nodes_in_elem.host(elements_to_send[i][j], k)); - // } - // } - // displacement += num_connectivity_entries; - // } - // // Send the connectivity data to each rank - // MPI_Scatterv(all_nodes_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, - // nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, - // 0, MPI_COMM_WORLD); - // } - // else { - // MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - // 
nodes_in_elem_on_rank.data(), num_elements_on_rank * num_nodes_per_elem, MPI_INT, - // 0, MPI_COMM_WORLD); - // } - - // MPI_Barrier(MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" before scattering the element-node connectivity data to each rank"< conn_sendcounts(world_size); @@ -420,9 +368,11 @@ void naive_partition_mesh( } // Scatter using the native storage type (size_t) and then convert locally to int - size_t* global_nodes_in_elem = (rank == 0) - ? initial_mesh.nodes_in_elem.host_pointer() - : nullptr; + size_t* global_nodes_in_elem = nullptr; + if (rank == 0) { + global_nodes_in_elem = initial_mesh.nodes_in_elem.host_pointer(); + } + MPI_Barrier(MPI_COMM_WORLD); std::vector nodes_in_elem_on_rank_size_t(num_elements_on_rank * num_nodes_per_elem); @@ -434,12 +384,6 @@ void naive_partition_mesh( nodes_in_elem_on_rank[idx] = static_cast(nodes_in_elem_on_rank_size_t[idx]); } - MPI_Barrier(MPI_COMM_WORLD); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" After scattering the element-node connectivity data to each rank"<> ghost_nodes_from_ranks; - - // Iterate through connectivity data from each rank (except ourselves) for (int r = 0; r < world_size; r++) { if (r == rank) continue; // Skip our own data - we already know our elements @@ -928,38 +845,6 @@ void build_ghost( } } - - - // MPI_Barrier(MPI_COMM_WORLD); - // for (int r = 0; r < world_size; r++) { - // MPI_Barrier(MPI_COMM_WORLD); - // if (rank == r) { - // std::cout << "Rank " << rank << " has the following shared nodes: "; - // for (const auto& node_gid : shared_nodes) { - // std::cout << node_gid << " "; - // } - // std::cout << std::endl; - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - // MPI_Barrier(MPI_COMM_WORLD); - // // Print out the ghost nodes for each rank sequentially - // for (int r = 0; r < world_size; r++) { - // MPI_Barrier(MPI_COMM_WORLD); - // if (rank == r) { - // std::cout << "Rank " << rank << " has the following ghost nodes: "; - // for 
(const auto& node_gid : ghost_node_gids) { - // std::cout << node_gid << " "; - // } - // std::cout << std::endl; - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - // WARNING: HERE IS THE BUG: - // When we create the send pattern for ghost nodes, we are not filtering out nodes that are on MPI rank boundaries - // Create a vecor of the ranks that this rank will receive data from for ghost nodes std::set ghost_node_receive_ranks; for (const auto& pair : ghost_node_recv_rank) { @@ -969,20 +854,6 @@ void build_ghost( std::vector ghost_node_receive_ranks_vec(ghost_node_receive_ranks.begin(), ghost_node_receive_ranks.end()); - // Print out the ghost node receive ranks for each rank sequentially - // for (int r = 0; r < world_size; r++) { - // if (rank == r) { - // MPI_Barrier(MPI_COMM_WORLD); - // std::cout << "Rank " << rank << " will receive data from the following ranks for ghost nodes: "; - // for (int r : ghost_node_receive_ranks_vec) { - // std::cout << r << " "; - // } - // std::cout << std::endl; - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - // Find which nodes *we own* are ghosted on other ranks, and on which ranks // We want: for each of our local nodes, the list of ranks that ghost it @@ -1009,8 +880,6 @@ void build_ghost( } } - // WARNING: THE PREVIOUS STEP MUST INCLUDE ALL NODES AFTER MOVING GHOST NODES ONTO THIS RANK, and must be filtered to not include shared ndoes - // Use the map to create a vector of the ranks that this rank will receive data from for ghost nodes std::set ghost_node_send_ranks; for (const auto& pair : local_node_gid_to_ghosting_ranks) { @@ -1018,15 +887,6 @@ void build_ghost( } std::vector ghost_node_send_ranks_vec(ghost_node_send_ranks.begin(), ghost_node_send_ranks.end()); - // std::map> nodes_to_send_by_rank; // rank -> list of local node indices - // for (int r = 0; r < world_size; r++) { - // if (r == rank) continue; - // for (size_t node_gid : shared_nodes_on_ranks[r]) { - // int local_node_id = 
global_to_local_node_mapping[node_gid]; - // nodes_to_send_by_rank[r].push_back(local_node_id); - // } - // } - // Store the count of ghost elements for later use input_mesh.num_ghost_elems = ghost_elem_gids.size(); input_mesh.num_ghost_nodes = ghost_node_gids.size(); @@ -1109,7 +969,6 @@ void build_ghost( } } - // Assign extended local IDs to ghost-only nodes for (size_t node_gid : ghost_only_nodes) { node_gid_to_extended_lid[node_gid] = extended_node_lid++; @@ -1276,25 +1135,9 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); - double t_ghost_end = MPI_Wtime(); - - if (rank == 0) { - std::cout << " Finished calculating ghost elements" << std::endl; - std::cout << " Ghost element calculation took " << (t_ghost_end - t_ghost_start) << " seconds." << std::endl; - } - output_mesh.nodes_in_elem.update_device(); output_mesh.build_connectivity(); - - - - - - - - - MPI_Barrier(MPI_COMM_WORLD); if(rank == 0) std::cout << " Finished building final mesh structure with ghost nodes and elements" << std::endl; @@ -1528,8 +1371,6 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); - - std::map> node_set_to_send_by_rank; // For each owned element that will be ghosted on other ranks, @@ -1556,23 +1397,6 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); - // Print out node_set_to_send_by_rank for each rank sequentially - // MPI_Barrier(MPI_COMM_WORLD); - // for (int r = 0; r < world_size; r++) { - // MPI_Barrier(MPI_COMM_WORLD); - // if (rank == r) { - // std::cout << "Rank " << r << " node_set_to_send_by_rank:" << std::endl; - // for (const auto& [dest_rank, node_gids] : node_set_to_send_by_rank) { - // std::cout << " To rank " << dest_rank << ": ["; - // for (size_t node_gid : node_gids) { - // std::cout << node_gid << " "; - // } - // std::cout << "]" << std::endl; - // } - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - std::map> nodes_to_send_by_rank; // rank -> list of global node indices // Copy the node_set_to_send_by_rank map to nodes_to_send_by_rank @@ -1582,24 +1406,6 @@ 
void build_ghost( } } - // // Print out nodes_to_send_by_rank for each rank sequentially - // MPI_Barrier(MPI_COMM_WORLD); - // for (int r = 0; r < world_size; r++) { - // MPI_Barrier(MPI_COMM_WORLD); - // if (rank == r) { - // std::cout << "Rank " << r << " nodes_to_send_by_rank:" << std::endl; - // for (const auto& [dest_rank, node_gids] : nodes_to_send_by_rank) { - // std::cout << " To rank " << dest_rank << ": ["; - // for (size_t node_gid : node_gids) { - // std::cout << node_gid << " "; - // } - // std::cout << "]" << std::endl; - // } - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - // Initialize graph comms for elements // MPI_Dist_graph_create_adjacent creates a distributed graph topology communicator // that efficiently represents the communication pattern between ranks. @@ -1663,11 +1469,6 @@ void build_ghost( node_communication_plan.initialize_graph_communicator(node_outdegree, node_destinations, node_indegree, node_sources); MPI_Barrier(MPI_COMM_WORLD); - // Optional: Verify the graph communicator was created successfully - // print_info = true; - // if(print_info) node_communication_plan.verify_graph_communicator(); - // print_info = false; - // ****************************************************************************************** // Build send counts and displacements for element communication // ****************************************************************************************** @@ -1751,20 +1552,6 @@ void build_ghost( // 3) For each locally-owned node gid, lookup ranks that ghost it and record targets. 
// -------------------------------------------------------------------------------------- - - // Print out the nodes to send by rank for each rank sequentially - // for (int r = 0; r < world_size; r++) { - // if (rank == r) { - // std::cout << "Rank " << rank << " will send data to the following ranks for ghost nodes: "; - // for (const auto& rank_node_pair : nodes_to_send_by_rank) { - // std::cout << rank_node_pair.first << " "; - // } - // std::cout << std::endl; - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - // Serialize into a DRaggedRightArrayKokkos CArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks); for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { @@ -1784,8 +1571,6 @@ void build_ghost( } nodes_to_send_by_rank_rr.update_device(); - - // For each ghost element, determine which nodes need to be received from the owning rank // Build the receive list based on ghost element nodes, not on ghost_node_gids // This ensures we receive all nodes needed by ghost elements @@ -1857,47 +1642,6 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); - // // print the nodes to send by rank rr for each rank sequentially - // for (int r = 0; r < world_size; r++) { - // MPI_Barrier(MPI_COMM_WORLD); - // if (rank == r) { - // std::cout << "Rank " << rank << " will send nodes to the following ranks (nodes_to_send_by_rank_rr):" << std::endl; - // for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { - // int dest_rank = node_communication_plan.send_rank_ids.host(i); - // std::cout << " To rank " << dest_rank << ": ["; - // for (int j = 0; j < nodes_to_send_by_rank[dest_rank].size(); j++) { - // int global_node_id = nodes_to_send_by_rank[dest_rank][j]; - // std::cout << global_node_id << " "; - // } - // std::cout << "]" << std::endl; - // } - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - // MPI_Barrier(MPI_COMM_WORLD); - - // // print the nodes to send by rank rr for each rank sequentially - // for (int r = 0; r < 
world_size; r++) { - // if (rank == r) { - // std::cout << "Rank " << rank << " will receive nodes from the following ranks (nodes_to_recv_by_rank_rr):" << std::endl; - // for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { - // int source_rank = node_communication_plan.recv_rank_ids.host(i); - // std::cout << " From rank " << source_rank << ": ["; - // for (int j = 0; j < nodes_to_recv_by_rank[source_rank].size(); j++) { - // int node_lid = nodes_to_recv_by_rank[source_rank][j]; - // size_t global_node_id = output_mesh.local_to_global_node_mapping.host(node_lid); - // std::cout << global_node_id << " "; - // } - // std::cout << "]" << std::endl; - // } - // } - // MPI_Barrier(MPI_COMM_WORLD); - // } - - - - node_communication_plan.setup_send_recv(nodes_to_send_by_rank_rr, nodes_to_recv_by_rank_rr); MPI_Barrier(MPI_COMM_WORLD); @@ -2207,9 +1951,6 @@ void partition_mesh( // Other topology options could be substituted above according to your needs (see docs). SCOTCH_archCmplt(&archdat, static_cast(world_size)); - - - // ===================== PT-Scotch Strategy Selection and Documentation ====================== // The PT-Scotch "strategy" (stratdat here) controls the algorithms and heuristics used for partitioning. // You can specify a string or build a strategy using functions that adjust speed, quality, and recursion. 
@@ -2348,10 +2089,6 @@ void partition_mesh( // New elements owned by this rank int num_new_elems = static_cast(new_elem_gids.size()); - if (print_info) { - std::cout << "[rank " << rank << "] new elems: " << num_new_elems << std::endl; - } - // -------------- Phase 3: Send element–node connectivity -------------- int nodes_per_elem = naive_mesh.num_nodes_in_elem; @@ -2472,13 +2209,9 @@ void partition_mesh( intermediate_mesh.local_to_global_node_mapping.update_device(); intermediate_mesh.local_to_global_elem_mapping.update_device(); - - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Starting reverse mapping of the element-node connectivity from the global node ids to the local node ids"< Date: Fri, 21 Nov 2025 13:07:03 -0600 Subject: [PATCH 39/52] ENH: Parallelize mesh builder, tidy --- examples/mesh_decomp/mesh_decomp.cpp | 4 - examples/mesh_decomp/mesh_io.h | 149 ++++++++++++++++++--------- 2 files changed, 101 insertions(+), 52 deletions(-) diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 5738b21f..9663b306 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -58,10 +58,6 @@ int main(int argc, char** argv) { std::cout<<"Initializing mesh"< split(std::string s, std::string delimiter) /// \param Number of j indices /// ///////////////////////////////////////////////////////////////////////////// -inline int get_id(int i, int j, int k, int num_i, int num_j) +KOKKOS_INLINE_FUNCTION +size_t get_id(int i, int j, int k, int num_i, int num_j) { return i + j * num_i + k * num_i * num_j; -} +} // end get_id ///////////////////////////////////////////////////////////////////////////// /// @@ -189,20 +190,40 @@ void build_3d_box( // --- Build nodes --- + CArrayDual origin_mtr(3, "origin_mtr"); + origin_mtr(0) = origin[0]; + origin_mtr(1) = origin[1]; + origin_mtr(2) = origin[2]; + origin_mtr.update_device(); + + // populate the point data structures + FOR_ALL(k, 0, 
num_points_k, + j, 0, num_points_j, + i, 0, num_points_i,{ + + // global id for the point + size_t node_gid = get_id(i, j, k, num_points_i, num_points_j); + + // store the point coordinates + node.coords.host(node_gid, 0) = origin_mtr(0) + (double)i * dx; + node.coords.host(node_gid, 1) = origin_mtr(1) + (double)j * dy; + node.coords.host(node_gid, 2) = origin_mtr(2) + (double)k * dz; + }); + // populate the point data structures - for (int k = 0; k < num_points_k; k++) { - for (int j = 0; j < num_points_j; j++) { - for (int i = 0; i < num_points_i; i++) { - // global id for the point - int node_gid = get_id(i, j, k, num_points_i, num_points_j); + // for (int k = 0; k < num_points_k; k++) { + // for (int j = 0; j < num_points_j; j++) { + // for (int i = 0; i < num_points_i; i++) { + // // global id for the point + // int node_gid = get_id(i, j, k, num_points_i, num_points_j); - // store the point coordinates - node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; - node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; - node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; - } // end for i - } // end for j - } // end for k + // // store the point coordinates + // node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; + // node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; + // node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; + // } // end for i + // } // end for j + // } // end for k node.coords.update_device(); @@ -212,43 +233,75 @@ void build_3d_box( // --- Build elems --- - // populate the elem center data structures - for (int k = 0; k < num_elems_k; k++) { - for (int j = 0; j < num_elems_j; j++) { - for (int i = 0; i < num_elems_i; i++) { - // global id for the elem - int elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); - - // store the point IDs for this elem where the range is - // (i:i+1, j:j+1, k:k+1) for a linear hexahedron - int this_point = 0; - for (int kcount = k; kcount <= k + 1; kcount++) { - for (int 
jcount = j; jcount <= j + 1; jcount++) { - for (int icount = i; icount <= i + 1; icount++) { - // global id for the points - int node_gid = get_id(icount, jcount, kcount, - num_points_i, num_points_j); - - // convert this_point index to the FE index convention - int this_index = this_point; //convert_point_number_in_Hex(this_point); - - // store the points in this elem according the the finite - // element numbering convention - mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; - - // increment the point counting index - this_point = this_point + 1; - } // end for icount - } // end for jcount - } // end for kcount - } // end for i - } // end for j - } // end for k + // // populate the elem center data structures + // for (int k = 0; k < num_elems_k; k++) { + // for (int j = 0; j < num_elems_j; j++) { + // for (int i = 0; i < num_elems_i; i++) { + + // // global id for the elem + // int elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); + + // // store the point IDs for this elem where the range is + // // (i:i+1, j:j+1, k:k+1) for a linear hexahedron + // int this_point = 0; + // for (int kcount = k; kcount <= k + 1; kcount++) { + // for (int jcount = j; jcount <= j + 1; jcount++) { + // for (int icount = i; icount <= i + 1; icount++) { + // // global id for the points + // int node_gid = get_id(icount, jcount, kcount, + // num_points_i, num_points_j); + + // // convert this_point index to the FE index convention + // int this_index = this_point; //convert_point_number_in_Hex(this_point); + + // // store the points in this elem according the the finite + // // element numbering convention + // mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; + + // // increment the point counting index + // this_point = this_point + 1; + // } // end for icount + // } // end for jcount + // } // end for kcount + // } // end for i + // } // end for j + // } // end for k + + // populate the point data structures + FOR_ALL(k, 0, num_elems_k, + j, 0, num_elems_j, + 
i, 0, num_elems_i,{ + + // global id for the elem + size_t elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); + + // store the point IDs for this elem where the range is + // (i:i+1, j:j+1, k:k+1) for a linear hexahedron + int this_point = 0; + for (int kcount = k; kcount <= k + 1; kcount++) { + for (int jcount = j; jcount <= j + 1; jcount++) { + for (int icount = i; icount <= i + 1; icount++) { + // global id for the points + size_t node_gid = get_id(icount, jcount, kcount, + num_points_i, num_points_j); + + // convert this_point index to the FE index convention + int this_index = this_point; //convert_point_number_in_Hex(this_point); + + // store the points in this elem according the the finite + // element numbering convention + mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; + + // increment the point counting index + this_point++; + } // end for icount + } // end for jcount + } // end for kcount + }); // end parallel for // update device side mesh.nodes_in_elem.update_device(); - - + Kokkos::fence(); // Build connectivity mesh.build_connectivity(); From b0a1924793343e237e1bf055c809319ed47d35de Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Fri, 21 Nov 2025 16:08:08 -0600 Subject: [PATCH 40/52] STYLE: Tidying up, and reducing memory overhead --- examples/mesh_decomp/decomp_utils.h | 193 +++++++++++---------------- examples/mesh_decomp/mesh_decomp.cpp | 2 +- examples/mesh_decomp/mesh_io.h | 69 ++-------- 3 files changed, 91 insertions(+), 173 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index ac1fd6e9..18a3508a 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -51,8 +51,8 @@ void naive_partition_mesh( node_t& initial_node, Mesh_t& naive_mesh, node_t& naive_node, - std::vector& elems_in_elem_on_rank, - std::vector& num_elems_in_elem_per_rank, + CArrayDual& elems_in_elem_on_rank, + CArrayDual& num_elems_in_elem_per_rank, int world_size, int rank) { @@ 
-61,25 +61,12 @@ void naive_partition_mesh( int num_elements_on_rank = 0; int num_nodes_on_rank = 0; - int num_nodes_per_elem = 0; - - - std::vector nodes_on_rank; + int num_dim = initial_mesh.num_dims; + // Compute the number of elements to send to each rank and num_nodes_per_elem std::vector elems_per_rank(world_size); // number of elements to send to each rank size(world_size) - std::vector nodes_per_rank(world_size); // number of nodes to send to each rank size(world_size) - - // create a 2D vector of elements to send to each rank - std::vector> elements_to_send(world_size); - - // create a 2D vector of nodes to send to each rank - std::vector> nodes_to_send(world_size); - - // Create a 2D vector to hold the nodal positions on each rank - std::vector> node_pos_to_send(world_size); - if (rank == 0) { num_nodes_per_elem = initial_mesh.num_nodes_in_elem; @@ -107,9 +94,7 @@ void naive_partition_mesh( // int root, MPI_Comm comm) MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" Starting the scatter operation for the element counts per rank"< elements_on_rank(num_elements_on_rank); - MPI_Barrier(MPI_COMM_WORLD); - double t_scatter_end = MPI_Wtime(); + // ******************************************************** // Scatter the actual element global ids to each rank // ******************************************************** - double t_scatter_gids_start = MPI_Wtime(); + // create a 2D vector of elements to send to each rank + std::vector> elements_to_send(world_size); if (rank == 0) { // Populate the elements_to_send array by finding all elements in the elements_per_rank array and adding them to the elements_to_send array - int elem_gid = 0; for (int rank = 0; rank < world_size; rank++) { - for (int j = 0; j < elems_per_rank[rank]; j++) { elements_to_send[rank].push_back(elem_gid); elem_gid++; @@ -181,12 +164,13 @@ void naive_partition_mesh( // Wait for all ranks to complete the scatter operation MPI_Barrier(MPI_COMM_WORLD); - MPI_Barrier(MPI_COMM_WORLD); - 
if(rank == 0) std::cout<<" After scattering element counts per rank"< nodes_per_rank(world_size); // number of nodes to send to each rank size(world_size) + std::vector nodes_on_rank; // node gids the current rank + std::vector> nodes_to_send(world_size); // nodes to send to each rank + if (rank == 0) { // Populate the nodes_to_send array by finding all nodes in the elements in elements_to_send and removing duplicates @@ -203,40 +187,17 @@ void naive_partition_mesh( for (int i = 0; i < world_size; i++) { nodes_per_rank[i] = nodes_to_send[i].size(); } - - if (print_info) { - std::cout< node_pos_on_rank_flat(num_nodes_on_rank * 3); - - if(rank == 0) - { - for (int i = 0; i < world_size; i++) { - for(int node_gid = 0; node_gid < nodes_to_send[i].size(); node_gid++) - { - node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 0)); - node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 1)); - node_pos_to_send[i].push_back(initial_node.coords.host(nodes_to_send[i][node_gid], 2)); - } - } + // Create a flat 1D vector for node positions (num_dim coordinates per node) + std::vector node_pos_on_rank_flat(num_nodes_on_rank * num_dim); + CArrayDual node_pos_on_rank(num_nodes_on_rank, num_dim, "node_pos_on_rank_decomp"); + + if(rank == 0){ // Prepare data for MPI_Scatterv (scatter with variable counts) // Flatten the 2D node_pos_to_send into a 1D array @@ -305,29 +258,30 @@ void naive_partition_mesh( int displacement = 0; for (int i = 0; i < world_size; i++) { - sendcounts[i] = nodes_to_send[i].size() * 3; + sendcounts[i] = nodes_to_send[i].size() * num_dim; displs[i] = displacement; // displacement is the starting index of the nodes for the current rank in the flattened array // Copy node positions for rank i to the flattened array - for(int j = 0; j < nodes_to_send[i].size(); j++) { - for(int k = 0; k < 3; k++) { - all_node_pos.push_back(node_pos_to_send[i][j * 3 + k]); + for(int node_gid = 0; node_gid < 
nodes_to_send[i].size(); node_gid++) { + for(int dim = 0; dim < num_dim; dim++) { + all_node_pos.push_back(initial_node.coords.host(nodes_to_send[i][node_gid], dim)); } } - displacement += nodes_to_send[i].size() * 3; + displacement += nodes_to_send[i].size() * num_dim; } // Send the node positions to each rank MPI_Scatterv(all_node_pos.data(), sendcounts.data(), displs.data(), MPI_DOUBLE, - node_pos_on_rank_flat.data(), num_nodes_on_rank * 3, MPI_DOUBLE, + node_pos_on_rank.host_pointer(), num_nodes_on_rank * num_dim, MPI_DOUBLE, 0, MPI_COMM_WORLD); } else { MPI_Scatterv(nullptr, nullptr, nullptr, MPI_DOUBLE, - node_pos_on_rank_flat.data(), num_nodes_on_rank * 3, MPI_DOUBLE, + node_pos_on_rank.host_pointer(), num_nodes_on_rank * num_dim, MPI_DOUBLE, 0, MPI_COMM_WORLD); } MPI_Barrier(MPI_COMM_WORLD); + node_pos_on_rank.update_device(); // ****************************************************************************************** // Initialize the node state variables @@ -335,16 +289,14 @@ void naive_partition_mesh( // initialize node state variables, for now, we just need coordinates, the rest will be initialize by the respective solvers std::vector required_node_state = { node_state::coords }; - - naive_node.initialize(num_nodes_on_rank, 3, required_node_state); + naive_node.initialize(num_nodes_on_rank, num_dim, required_node_state); - for(int i = 0; i < num_nodes_on_rank; i++) { - naive_node.coords.host(i, 0) = node_pos_on_rank_flat[i*3]; - naive_node.coords.host(i, 1) = node_pos_on_rank_flat[i*3+1]; - naive_node.coords.host(i, 2) = node_pos_on_rank_flat[i*3+2]; - } + FOR_ALL(node_id, 0, num_nodes_on_rank, + dim, 0, num_dim,{ + naive_node.coords(node_id, dim) = node_pos_on_rank(node_id, dim); + }); - naive_node.coords.update_device(); + naive_node.coords.update_host(); // ****************************************************************************************** // Send the element-node connectivity data from the initial mesh to each rank @@ -374,14 +326,17 @@ void 
naive_partition_mesh( } MPI_Barrier(MPI_COMM_WORLD); - std::vector nodes_in_elem_on_rank_size_t(num_elements_on_rank * num_nodes_per_elem); - MPI_Scatterv(global_nodes_in_elem, conn_sendcounts.data(), conn_displs.data(), MPI_UNSIGNED_LONG_LONG, - nodes_in_elem_on_rank_size_t.data(), nodes_in_elem_on_rank_size_t.size(), MPI_UNSIGNED_LONG_LONG, - 0, MPI_COMM_WORLD); + { //scope to free memory for tmp vector + std::vector nodes_in_elem_on_rank_size_t(num_elements_on_rank * num_nodes_per_elem); + + MPI_Scatterv(global_nodes_in_elem, conn_sendcounts.data(), conn_displs.data(), MPI_UNSIGNED_LONG_LONG, + nodes_in_elem_on_rank_size_t.data(), nodes_in_elem_on_rank_size_t.size(), MPI_UNSIGNED_LONG_LONG, + 0, MPI_COMM_WORLD); - for (size_t idx = 0; idx < nodes_in_elem_on_rank_size_t.size(); ++idx) { - nodes_in_elem_on_rank[idx] = static_cast(nodes_in_elem_on_rank_size_t[idx]); + for (size_t idx = 0; idx < nodes_in_elem_on_rank_size_t.size(); ++idx) { + nodes_in_elem_on_rank[idx] = static_cast(nodes_in_elem_on_rank_size_t[idx]); + } } // ****************************************************************************************** @@ -390,12 +345,10 @@ void naive_partition_mesh( // First, rank 0 computes how many connectivity entries each rank will receive // and scatters that information - std::vector elem_elem_counts(world_size); int total_elem_elem_entries = 0; - - - double t_scatter_elem_elem_start = MPI_Wtime(); + std::vector elem_elem_counts(world_size); + if (rank == 0){ // Calculate total number of connectivity entries for each rank for(int i = 0; i < world_size; i++) { @@ -413,12 +366,11 @@ void naive_partition_mesh( 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<" After scattering the element-element connectivity data to each rank"<(total_elem_elem_entries, "elems_in_elem_on_rank"); + // Now scatter the num_elems_in_elem for each element on each rank - num_elems_in_elem_per_rank.resize(num_elements_on_rank); + num_elems_in_elem_per_rank = 
CArrayDual(num_elements_on_rank, "num_elems_in_elem_per_rank"); if (rank == 0) { std::vector all_num_elems_in_elem; @@ -434,14 +386,16 @@ void naive_partition_mesh( } MPI_Scatterv(all_num_elems_in_elem.data(), elems_per_rank.data(), displs_ee.data(), MPI_INT, - num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, + num_elems_in_elem_per_rank.host_pointer(), num_elements_on_rank, MPI_INT, 0, MPI_COMM_WORLD); } else { MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - num_elems_in_elem_per_rank.data(), num_elements_on_rank, MPI_INT, + num_elems_in_elem_per_rank.host_pointer(), num_elements_on_rank, MPI_INT, 0, MPI_COMM_WORLD); } + num_elems_in_elem_per_rank.update_device(); + if (rank == 0){ // Prepare the element-element connectivity data for each rank std::vector all_elems_in_elem; @@ -465,22 +419,24 @@ void naive_partition_mesh( // Send the element-element connectivity data to each rank using MPI_Scatterv MPI_Scatterv(all_elems_in_elem.data(), sendcounts.data(), displs.data(), MPI_INT, - elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, + elems_in_elem_on_rank.host_pointer(), total_elem_elem_entries, MPI_INT, 0, MPI_COMM_WORLD); } else { MPI_Scatterv(nullptr, nullptr, nullptr, MPI_INT, - elems_in_elem_on_rank.data(), total_elem_elem_entries, MPI_INT, + elems_in_elem_on_rank.host_pointer(), total_elem_elem_entries, MPI_INT, 0, MPI_COMM_WORLD); } + elems_in_elem_on_rank.update_device(); + MPI_Barrier(MPI_COMM_WORLD); // ****************************************************************************************** // Initialize the naive_mesh data structures for each rank // ****************************************************************************************** naive_mesh.initialize_nodes(num_nodes_on_rank); - naive_mesh.initialize_elems(num_elements_on_rank, 3); + naive_mesh.initialize_elems(num_elements_on_rank, num_dim); naive_mesh.local_to_global_node_mapping = DCArrayKokkos(num_nodes_on_rank, "naive_mesh.local_to_global_node_mapping"); 
naive_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_elements_on_rank, "naive_mesh.local_to_global_elem_mapping"); @@ -504,12 +460,11 @@ void naive_partition_mesh( // rebuild the local element-node connectivity using the local node ids for(int i = 0; i < num_elements_on_rank; i++) { for(int j = 0; j < num_nodes_per_elem; j++) { - int node_gid = nodes_in_elem_on_rank[i * num_nodes_per_elem + j]; int node_lid = -1; - // Use binary search to find the local node index for node_gid + // Use binary search to find the local node index for node_gid, local_to_global_node_mapping is sorted int left = 0, right = num_nodes_on_rank - 1; while (left <= right) { int mid = left + (right - left) / 2; @@ -541,7 +496,6 @@ void naive_partition_mesh( // ****************************************************************************************** // Build the connectivity for the local naive_mesh // ****************************************************************************************** - naive_mesh.build_connectivity(); MPI_Barrier(MPI_COMM_WORLD); @@ -1689,6 +1643,8 @@ void partition_mesh( bool print_info = false; bool print_vtk = false; + int num_dim = initial_mesh.num_dims; + // Create mesh, gauss points, and node data structures on each rank // This is the initial partitioned mesh Mesh_t naive_mesh; @@ -1698,19 +1654,17 @@ void partition_mesh( Mesh_t intermediate_mesh; node_t intermediate_node; - // Helper arrays to hold element-element connectivity for naive partitioning that include what would be ghost, without having to build the full mesh - std::vector elems_in_elem_on_rank; - std::vector num_elems_in_elem_per_rank; + CArrayDual elems_in_elem_on_rank; + CArrayDual num_elems_in_elem_per_rank; // Perform the naive partitioning of the mesh MPI_Barrier(MPI_COMM_WORLD); if (rank == 0) std::cout << "Performing the naive partitioning of the mesh" << std::endl; - naive_partition_mesh(initial_mesh, initial_node, naive_mesh, naive_node, elems_in_elem_on_rank, 
num_elems_in_elem_per_rank, world_size, rank); MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0) std::cout << "Naive partitioning of the mesh completed" << std::endl; + if (rank == 0) std::cout << "Begin repartitioning using PT-Scotch" << std::endl; /********************************************************************************** * Build PT-Scotch distributed graph representation of the mesh for repartitioning * @@ -1791,7 +1745,22 @@ void partition_mesh( // edgeloctab: flat array of neighbor global IDs for all local elements, built in order std::vector edgeloctab; - edgeloctab.reserve(vertlocnbr * 6); // heuristic: assume typical mesh degree is ~6, for performance + // edgeloctab holds the flattened list of all neighbors (edges) for all local elements, + // in a compact CSR (Compressed Sparse Row) format expected by PT-Scotch. Each entry is a global element ID + // of a neighbor. The edgeloctab array is built incrementally with one entry per element neighbor edge, + // so we reserve its capacity up front for efficiency. + // + // Heuristic: For unstructured 3D hexahedral meshes, a single element can have significantly more neighbors + // than in 2D cases. In a fully structured 3D grid, each hexahedral element can have up to 26 neighbors + // (since it may touch all surrounding elements along all axes). In unstructured grids, it's possible for some + // elements to have even more neighbors due to mesh irregularities and refinements. + // + // For most practical unstructured hexahedral meshes, values in the low 20s are common, but extreme cases + // (e.g., high-order connectivity, pathological splits, or meshes with "hanging nodes") may see higher counts. + // Using vertlocnbr * 26 as an upper limit is a reasonable estimate for fully connected (structured) cases, + // but consider increasing this if working with highly unstructured or pathological meshes. For safety and + // to avoid repeated reallocations during construction, we use 26 here as a conservative guess. 
+ edgeloctab.reserve(vertlocnbr * 26); // Construct a map from element GID to its offset into elems_in_elem_on_rank (the array of neighbor GIDs) // This allows, for a given element GID, quick lookup of where its neighbor list starts in the flat array. @@ -1800,7 +1769,7 @@ void partition_mesh( for (size_t k = 0; k < naive_mesh.num_elems; k++) { int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); elem_gid_to_offset[elem_gid_on_rank] = current_offset; - current_offset += num_elems_in_elem_per_rank[k]; // WARNING< THIS MUST INCLUDE GHOST< WHICH DONT EXISTS ON THE NAIVE MESH + current_offset += num_elems_in_elem_per_rank(k); } // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- @@ -1827,11 +1796,11 @@ void partition_mesh( break; } } - size_t num_nbrs = num_elems_in_elem_per_rank[idx]; + size_t num_nbrs = num_elems_in_elem_per_rank(idx); // Append each neighbor (by its GLOBAL elem GID) to edgeloctab for (size_t j = 0; j < num_nbrs; j++) { - size_t neighbor_gid = elems_in_elem_on_rank[elems_in_elem_offset + j]; // This is a global element ID! + size_t neighbor_gid = elems_in_elem_on_rank(elems_in_elem_offset + j); // This is a global element ID! 
edgeloctab.push_back(static_cast(neighbor_gid)); ++offset; // Increment running edge count } @@ -2162,9 +2131,9 @@ void partition_mesh( int node_lid = naive_mesh.nodes_in_elem.host(lid, j); int node_gid = naive_mesh.local_to_global_node_mapping.host(node_lid); - node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 0)); - node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 1)); - node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, 2)); + for(int dim = 0; dim < num_dim; dim++) { + node_coords_sendbuf.push_back(naive_node.coords.host(node_lid, dim)); + } } } } diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 9663b306..a5de7a8b 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {100, 100, 100}; + int num_elems_dim[3] = {200, 200, 200}; // Initial mesh built on rank zero Mesh_t initial_mesh; diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index 1a043ca1..a3530a07 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -205,68 +205,16 @@ void build_3d_box( size_t node_gid = get_id(i, j, k, num_points_i, num_points_j); // store the point coordinates - node.coords.host(node_gid, 0) = origin_mtr(0) + (double)i * dx; - node.coords.host(node_gid, 1) = origin_mtr(1) + (double)j * dy; - node.coords.host(node_gid, 2) = origin_mtr(2) + (double)k * dz; + node.coords(node_gid, 0) = origin_mtr(0) + (double)i * dx; + node.coords(node_gid, 1) = origin_mtr(1) + (double)j * dy; + node.coords(node_gid, 2) = origin_mtr(2) + (double)k * dz; }); - - // populate the point data structures - // for (int k = 0; k < num_points_k; k++) { - // for (int j = 0; j < num_points_j; j++) { - // for (int i = 0; i < num_points_i; i++) { - // // global id for the point - // 
int node_gid = get_id(i, j, k, num_points_i, num_points_j); - - // // store the point coordinates - // node.coords.host(node_gid, 0) = origin[0] + (double)i * dx; - // node.coords.host(node_gid, 1) = origin[1] + (double)j * dy; - // node.coords.host(node_gid, 2) = origin[2] + (double)k * dz; - // } // end for i - // } // end for j - // } // end for k - - - node.coords.update_device(); + // Update the host side + node.coords.update_host(); // initialize elem variables mesh.initialize_elems(num_elems, num_dim); - // --- Build elems --- - - // // populate the elem center data structures - // for (int k = 0; k < num_elems_k; k++) { - // for (int j = 0; j < num_elems_j; j++) { - // for (int i = 0; i < num_elems_i; i++) { - - // // global id for the elem - // int elem_gid = get_id(i, j, k, num_elems_i, num_elems_j); - - // // store the point IDs for this elem where the range is - // // (i:i+1, j:j+1, k:k+1) for a linear hexahedron - // int this_point = 0; - // for (int kcount = k; kcount <= k + 1; kcount++) { - // for (int jcount = j; jcount <= j + 1; jcount++) { - // for (int icount = i; icount <= i + 1; icount++) { - // // global id for the points - // int node_gid = get_id(icount, jcount, kcount, - // num_points_i, num_points_j); - - // // convert this_point index to the FE index convention - // int this_index = this_point; //convert_point_number_in_Hex(this_point); - - // // store the points in this elem according the the finite - // // element numbering convention - // mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; - - // // increment the point counting index - // this_point = this_point + 1; - // } // end for icount - // } // end for jcount - // } // end for kcount - // } // end for i - // } // end for j - // } // end for k - // populate the point data structures FOR_ALL(k, 0, num_elems_k, j, 0, num_elems_j, @@ -290,7 +238,7 @@ void build_3d_box( // store the points in this elem according the the finite // element numbering convention - 
mesh.nodes_in_elem.host(elem_gid, this_index) = node_gid; + mesh.nodes_in_elem(elem_gid, this_index) = node_gid; // increment the point counting index this_point++; @@ -299,8 +247,9 @@ void build_3d_box( } // end for kcount }); // end parallel for - // update device side - mesh.nodes_in_elem.update_device(); + // Update the host side + mesh.nodes_in_elem.update_host(); + Kokkos::fence(); // Build connectivity From 052aa7c12e7909b1deb47dc6f41b58096c01d3c1 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 24 Nov 2025 11:30:42 -0600 Subject: [PATCH 41/52] ENH: Adding GPU safety, WIP --- examples/mesh_decomp/decomp_utils.h | 67 ++++++++++++++++++++++------ examples/mesh_decomp/mesh_decomp.cpp | 2 +- 2 files changed, 55 insertions(+), 14 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 18a3508a..617d4014 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -350,11 +350,18 @@ void naive_partition_mesh( std::vector elem_elem_counts(world_size); if (rank == 0){ + + DCArrayKokkos tmp_num_elems_in_elem(initial_mesh.num_elems, "tmp_elems_in_elem"); + FOR_ALL(i, 0, initial_mesh.num_elems, { + tmp_num_elems_in_elem(i) = initial_mesh.num_elems_in_elem(i); + }); + tmp_num_elems_in_elem.update_host(); + MATAR_FENCE(); // Calculate total number of connectivity entries for each rank for(int i = 0; i < world_size; i++) { elem_elem_counts[i] = 0; for(int k = 0; k < elements_to_send[i].size(); k++) { - elem_elem_counts[i] += initial_mesh.num_elems_in_elem(elements_to_send[i][k]); + elem_elem_counts[i] += tmp_num_elems_in_elem.host(elements_to_send[i][k]); } } } @@ -366,6 +373,7 @@ void naive_partition_mesh( 0, MPI_COMM_WORLD); MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout<< " Finished scatter" <(total_elem_elem_entries, "elems_in_elem_on_rank"); @@ -376,12 +384,24 @@ void naive_partition_mesh( std::vector all_num_elems_in_elem; std::vector displs_ee(world_size); int 
displacement = 0; + + DCArrayKokkos tmp_num_elems_in_elem(initial_mesh.num_elems, "tmp_elems_in_elem"); + FOR_ALL(i, 0, initial_mesh.num_elems, { + tmp_num_elems_in_elem(i) = initial_mesh.num_elems_in_elem(i); + }); + tmp_num_elems_in_elem.update_host(); + MATAR_FENCE(); for(int i = 0; i < world_size; i++) { displs_ee[i] = displacement; + + std::cout<< "Rank = "<< i < all_elems_in_elem; std::vector sendcounts(world_size); std::vector displs(world_size); int displacement = 0; + + DRaggedRightArrayKokkos tmp_elems_in_elem(initial_mesh.num_elems_in_elem, "temp_elem_in_elem"); + + FOR_ALL(elem_gid, 0, initial_mesh.num_elems, { + for (size_t i = 0; i < initial_mesh.num_elems_in_elem(elem_gid); i++) { + tmp_elems_in_elem(elem_gid, i) = initial_mesh.elems_in_elem(elem_gid, i); + } // end for i + }); // end FOR_ALL elems + tmp_elems_in_elem.update_host(); + Kokkos::fence(); + + + DCArrayKokkos tmp_num_elems_in_elem(initial_mesh.num_elems, "tmp_elems_in_elem"); + FOR_ALL(i, 0, initial_mesh.num_elems, { + tmp_num_elems_in_elem(i) = initial_mesh.num_elems_in_elem(i); + }); + tmp_num_elems_in_elem.update_host(); + MATAR_FENCE(); for(int i = 0; i < world_size; i++) { sendcounts[i] = elem_elem_counts[i]; @@ -410,8 +450,8 @@ void naive_partition_mesh( // Copy element-element connectivity for rank i for(int k = 0; k < elements_to_send[i].size(); k++) { - for(int l = 0; l < initial_mesh.num_elems_in_elem(elements_to_send[i][k]); l++) { - all_elems_in_elem.push_back(initial_mesh.elems_in_elem(elements_to_send[i][k], l)); + for(int l = 0; l < tmp_num_elems_in_elem.host(elements_to_send[i][k]); l++) { + all_elems_in_elem.push_back(tmp_elems_in_elem.host(elements_to_send[i][k], l)); } } displacement += elem_elem_counts[i]; @@ -2287,16 +2327,15 @@ void partition_mesh( gauss_point.fields_vec.communicate(); // Loop over all elements and average the values of elements connected to that element - for (int i = 0; i < final_mesh.num_elems; i++) { + FOR_ALL(i, 0, final_mesh.num_elems, { double 
value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { value += gauss_point.fields.host(final_mesh.elems_in_elem(i, j)); } value /= final_mesh.num_elems_in_elem(i); gauss_point.fields.host(i) = value; - } - for (int i = 0; i < final_mesh.num_elems; i++) { - double value = 0.0; + + value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { value += gauss_point.fields_vec.host(final_mesh.elems_in_elem(i, j), 0); } @@ -2304,7 +2343,8 @@ void partition_mesh( gauss_point.fields_vec.host(i, 0) = value; gauss_point.fields_vec.host(i, 1) = value; gauss_point.fields_vec.host(i, 2) = value; - } + }); + gauss_point.fields_vec.update_device(); @@ -2336,18 +2376,19 @@ void partition_mesh( // Update scalar field to visualize the communication + FOR_ALL(i, 0, final_mesh.num_elems, { - for(int elem_lid = 0; elem_lid < final_mesh.num_elems; elem_lid++) { double value = 0.0; for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - value += final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)); + value += final_node.scalar_field(final_mesh.nodes_in_elem(elem_lid, j)); } value /= final_mesh.num_nodes_in_elem; for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - final_node.scalar_field.host(final_mesh.nodes_in_elem(elem_lid, j)) = value; + final_node.scalar_field(final_mesh.nodes_in_elem(elem_lid, j)) = value; } - } + }); + } #endif // DECOMP_UTILS_H \ No newline at end of file diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index a5de7a8b..92732a88 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {200, 200, 200}; + int num_elems_dim[3] = {60, 60, 60}; // Initial mesh built on rank zero Mesh_t initial_mesh; From 4cc2a709ef8b8775eadbb0e4231f96c61db9aec9 Mon Sep 17 00:00:00 2001 From: Jacob 
Moore Date: Mon, 24 Nov 2025 14:00:18 -0600 Subject: [PATCH 42/52] BUG: Chasing CUDA build and run bugs --- examples/mesh_decomp/decomp_utils.h | 112 +++++++++++++++------------ examples/mesh_decomp/mesh_decomp.cpp | 2 +- examples/mesh_decomp/mesh_io.h | 27 +++++-- src/include/communication_plan.h | 6 +- 4 files changed, 88 insertions(+), 59 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 617d4014..6eb758ee 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -398,7 +398,7 @@ void naive_partition_mesh( std::cout<< "Rank = "<< i < owned_coords_send(3*local_owned_count, 0.0); + std::vector owned_coords_send(num_dim*local_owned_count, 0.0); for (int i = 0; i < local_owned_count; i++) { - owned_coords_send[3*i+0] = input_node.coords.host(i,0); - owned_coords_send[3*i+1] = input_node.coords.host(i,1); - owned_coords_send[3*i+2] = input_node.coords.host(i,2); + for(int dim = 0; dim < num_dim; dim++){ + owned_coords_send[num_dim*i+dim] = input_node.coords.host(i,dim); + } } - std::vector all_owned_coords(3 * total_owned, 0.0); + std::vector all_owned_coords(num_dim * total_owned, 0.0); // Create coordinate-specific counts and displacements (in units of doubles, not nodes) + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Getting coord_counts" << std::endl; + std::vector coord_counts(world_size); std::vector coord_displs(world_size); for (int r = 0; r < world_size; r++) { - coord_counts[r] = 3 * owned_counts[r]; // Each node has 3 doubles - coord_displs[r] = 3 * owned_displs[r]; // Displacement in doubles + coord_counts[r] = num_dim * owned_counts[r]; // Each node has num_dim doubles + coord_displs[r] = num_dim * owned_displs[r]; // Displacement in doubles } - MPI_Allgatherv(owned_coords_send.data(), 3*local_owned_count, MPI_DOUBLE, + MPI_Allgatherv(owned_coords_send.data(), num_dim*local_owned_count, MPI_DOUBLE, all_owned_coords.data(), coord_counts.data(), 
coord_displs.data(), MPI_DOUBLE, MPI_COMM_WORLD); // e) Build map: gid -> coord[3] - std::unordered_map> gid_to_coord; + std::unordered_map> gid_to_coord; for (int i = 0; i < total_owned; i++) { - std::array xyz = { - all_owned_coords[3*i+0], - all_owned_coords[3*i+1], - all_owned_coords[3*i+2] - }; - gid_to_coord[all_owned_gids[i]] = xyz; + std::vector xyz(num_dim); // size is runtime-dependent + for (int dim = 0; dim < num_dim; dim++) { + xyz[dim] = all_owned_coords[num_dim * i + dim]; + } + gid_to_coord[all_owned_gids[i]] = std::move(xyz); } // 4. Finally, fill output_node.coords with correct coordinates. @@ -1239,14 +1242,14 @@ void build_ghost( size_t gid = output_mesh.local_to_global_node_mapping.host(i); auto it = gid_to_coord.find(gid); if (it != gid_to_coord.end()) { - output_node.coords.host(i,0) = it->second[0]; - output_node.coords.host(i,1) = it->second[1]; - output_node.coords.host(i,2) = it->second[2]; + for (int dim = 0; dim < num_dim; dim++) { + output_node.coords.host(i,dim) = it->second[dim]; + } } else { // Could happen if there's a bug: fill with zeros for safety - output_node.coords.host(i,0) = 0.0; - output_node.coords.host(i,1) = 0.0; - output_node.coords.host(i,2) = 0.0; + for (int dim = 0; dim < num_dim; dim++) { + output_node.coords.host(i,dim) = 0.0; + } } } output_node.coords.update_device(); @@ -1314,6 +1317,7 @@ void build_ghost( } MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<"After boundary_elem_targets"< boundary_elem_local_ids; @@ -1354,7 +1358,7 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); output_mesh.num_boundary_elems = boundary_elem_local_ids.size(); - output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems); + output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems, "boundary_elem_local_ids"); for (int i = 0; i < output_mesh.num_boundary_elems; i++) { output_mesh.boundary_elem_local_ids.host(i) = boundary_elem_local_ids[i]; } @@ -1421,7 +1425,7 @@ void 
build_ghost( // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) // Could be used to specify communication volume if needed for optimization - int* sourceweights = MPI_UNWEIGHTED; + // int* sourceweights = MPI_UNWEIGHTED; // ---------- Prepare OUTGOING edges (destinations) ---------- // outdegree: Number of ranks to which this rank will SEND data @@ -1448,7 +1452,7 @@ void build_ghost( int* node_sources = (node_indegree > 0) ? ghost_node_receive_ranks_vec.data() : MPI_UNWEIGHTED; // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) - int* node_sourceweights = MPI_UNWEIGHTED; + //int* node_sourceweights = MPI_UNWEIGHTED; // ---------- Prepare OUTGOING edges (destinations) ---------- // outdegree: Number of ranks to which this rank will SEND data @@ -1457,11 +1461,12 @@ void build_ghost( int* node_destinations = (node_outdegree > 0) ? ghost_node_send_ranks_vec.data() : MPI_UNWEIGHTED; // destinationweights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) - int* node_destinationweights = MPI_UNWEIGHTED; + // int* node_destinationweights = MPI_UNWEIGHTED; // Initialize the graph communicator for node communication node_communication_plan.initialize_graph_communicator(node_outdegree, node_destinations, node_indegree, node_sources); MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout<<"After node graph communicator"< strides_array(element_communication_plan.num_send_ranks); + DCArrayKokkos strides_array(element_communication_plan.num_send_ranks, "strides_for_elems_to_send"); for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { int dest_rank = element_communication_plan.send_rank_ids.host(i); - strides_array(i) = elems_to_send_by_rank[dest_rank].size(); + strides_array.host(i) = elems_to_send_by_rank[dest_rank].size(); } + strides_array.update_device(); DRaggedRightArrayKokkos elems_to_send_by_rank_rr(strides_array, "elems_to_send_by_rank"); // Fill in the data @@ -1517,12 
+1523,13 @@ void build_ghost( } // ========== Serialize into a DRaggedRightArrayKokkos ========== - CArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks); + DCArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks, "elem_recv_strides_array"); for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { int source_rank = element_communication_plan.recv_rank_ids.host(i); - elem_recv_strides_array(i) = elems_to_recv_by_rank[source_rank].size(); + elem_recv_strides_array.host(i) = elems_to_recv_by_rank[source_rank].size(); } + elem_recv_strides_array.update_device(); DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); // Fill in the data for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { @@ -1532,6 +1539,7 @@ void build_ghost( } } elems_to_recv_by_rank_rr.update_device(); + MATAR_FENCE(); element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); MPI_Barrier(MPI_COMM_WORLD); @@ -1547,11 +1555,12 @@ void build_ghost( // -------------------------------------------------------------------------------------- // Serialize into a DRaggedRightArrayKokkos - CArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks); + DCArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks,"node_send_strides_array"); for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { int dest_rank = node_communication_plan.send_rank_ids.host(i); - node_send_strides_array(i) = nodes_to_send_by_rank[dest_rank].size(); + node_send_strides_array.host(i) = nodes_to_send_by_rank[dest_rank].size(); } + node_send_strides_array.update_device(); DRaggedRightArrayKokkos nodes_to_send_by_rank_rr(node_send_strides_array, "nodes_to_send_by_rank"); // Fill in the data @@ -1616,11 +1625,12 @@ void build_ghost( } // Serialize into a DRaggedRightArrayKokkos - CArrayKokkos 
nodes_recv_strides_array(node_communication_plan.num_recv_ranks); + DCArrayKokkos nodes_recv_strides_array(node_communication_plan.num_recv_ranks, "nodes_recv_strides_array"); for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { int source_rank = node_communication_plan.recv_rank_ids.host(i); - nodes_recv_strides_array(i) = nodes_to_recv_by_rank[source_rank].size(); + nodes_recv_strides_array.host(i) = nodes_to_recv_by_rank[source_rank].size(); } + nodes_recv_strides_array.update_device(); DRaggedRightArrayKokkos nodes_to_recv_by_rank_rr(nodes_recv_strides_array, "nodes_to_recv_by_rank"); // Fill in the data for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { @@ -1681,7 +1691,7 @@ void partition_mesh( int rank){ bool print_info = false; - bool print_vtk = false; + // bool print_vtk = false; int num_dim = initial_mesh.num_dims; @@ -1809,7 +1819,7 @@ void partition_mesh( for (size_t k = 0; k < naive_mesh.num_elems; k++) { int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); elem_gid_to_offset[elem_gid_on_rank] = current_offset; - current_offset += num_elems_in_elem_per_rank(k); + current_offset += num_elems_in_elem_per_rank.host(k); } // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- @@ -1836,11 +1846,11 @@ void partition_mesh( break; } } - size_t num_nbrs = num_elems_in_elem_per_rank(idx); + size_t num_nbrs = num_elems_in_elem_per_rank.host(idx); // Append each neighbor (by its GLOBAL elem GID) to edgeloctab for (size_t j = 0; j < num_nbrs; j++) { - size_t neighbor_gid = elems_in_elem_on_rank(elems_in_elem_offset + j); // This is a global element ID! + size_t neighbor_gid = elems_in_elem_on_rank.host(elems_in_elem_offset + j); // This is a global element ID! 
edgeloctab.push_back(static_cast(neighbor_gid)); ++offset; // Increment running edge count } @@ -2206,8 +2216,8 @@ void partition_mesh( // -------------- Phase 6: Build the intermediate_mesh -------------- intermediate_mesh.initialize_nodes(num_new_nodes); intermediate_mesh.initialize_elems(num_new_elems, naive_mesh.num_dims); - intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes); - intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); + intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes, "intermediate_mesh.local_to_global_node_mapping"); + intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems, "intermediate_mesh.local_to_global_elem_mapping"); // Fill global mappings for (int i = 0; i < num_new_nodes; i++) @@ -2330,19 +2340,19 @@ void partition_mesh( FOR_ALL(i, 0, final_mesh.num_elems, { double value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { - value += gauss_point.fields.host(final_mesh.elems_in_elem(i, j)); + value += gauss_point.fields(final_mesh.elems_in_elem(i, j)); } value /= final_mesh.num_elems_in_elem(i); - gauss_point.fields.host(i) = value; + gauss_point.fields(i) = value; value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { - value += gauss_point.fields_vec.host(final_mesh.elems_in_elem(i, j), 0); + value += gauss_point.fields_vec(final_mesh.elems_in_elem(i, j), 0); } value /= final_mesh.num_elems_in_elem(i); - gauss_point.fields_vec.host(i, 0) = value; - gauss_point.fields_vec.host(i, 1) = value; - gauss_point.fields_vec.host(i, 2) = value; + gauss_point.fields_vec(i, 0) = value; + gauss_point.fields_vec(i, 1) = value; + gauss_point.fields_vec(i, 2) = value; }); gauss_point.fields_vec.update_device(); @@ -2380,12 +2390,12 @@ void partition_mesh( double value = 0.0; for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - value += final_node.scalar_field(final_mesh.nodes_in_elem(elem_lid, j)); + value += 
final_node.scalar_field(final_mesh.nodes_in_elem(i, j)); } value /= final_mesh.num_nodes_in_elem; for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - final_node.scalar_field(final_mesh.nodes_in_elem(elem_lid, j)) = value; + final_node.scalar_field(final_mesh.nodes_in_elem(i, j)) = value; } }); diff --git a/examples/mesh_decomp/mesh_decomp.cpp b/examples/mesh_decomp/mesh_decomp.cpp index 92732a88..a580052a 100644 --- a/examples/mesh_decomp/mesh_decomp.cpp +++ b/examples/mesh_decomp/mesh_decomp.cpp @@ -34,7 +34,7 @@ int main(int argc, char** argv) { // Mesh size double origin[3] = {0.0, 0.0, 0.0}; double length[3] = {1.0, 1.0, 1.0}; - int num_elems_dim[3] = {60, 60, 60}; + int num_elems_dim[3] = {20, 20, 20}; // Initial mesh built on rank zero Mesh_t initial_mesh; diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index a3530a07..04c2cad1 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -191,12 +191,13 @@ void build_3d_box( // --- Build nodes --- CArrayDual origin_mtr(3, "origin_mtr"); - origin_mtr(0) = origin[0]; - origin_mtr(1) = origin[1]; - origin_mtr(2) = origin[2]; + origin_mtr.host(0) = origin[0]; + origin_mtr.host(1) = origin[1]; + origin_mtr.host(2) = origin[2]; origin_mtr.update_device(); // populate the point data structures + std::cout<<"First FOR_ALL"<(num_elems, num_cell_scalar_vars); auto elem_vec_fields = CArray(num_elems, num_cell_vec_vars, 3); + + DCArrayKokkos num_elems_in_elem(mesh.num_elems, "tmp_num_elem_in_elem"); + FOR_ALL(i, 0, mesh.num_elems, { + num_elems_in_elem(i) = (double)mesh.num_elems_in_elem(i); + }); + MATAR_FENCE(); + num_elems_in_elem.update_host(); for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { elem_fields(elem_gid, 0) = rank; - elem_fields(elem_gid, 1) = (double)mesh.num_elems_in_elem(elem_gid); + elem_fields(elem_gid, 1) = num_elems_in_elem.host(elem_gid); elem_fields(elem_gid, 2) = mesh.local_to_global_elem_mapping.host(elem_gid); 
elem_fields(elem_gid, 3) = gauss_point.fields.host(elem_gid); elem_vec_fields(elem_gid, 0, 0) = gauss_point.fields_vec.host(elem_gid, 0); @@ -594,6 +603,14 @@ void write_vtu(Mesh_t& mesh, CArray vec_fields(num_nodes, num_point_vec_vars, 3); CArray point_scalar_fields(num_nodes, num_point_scalar_vars); + + DCArrayKokkos num_elems_in_node(mesh.num_elems, "tmp_num_elems_in_node"); + FOR_ALL(i, 0, mesh.num_elems, { + num_elems_in_node(i) = (double)mesh.num_corners_in_node(i); + }); + MATAR_FENCE(); + num_elems_in_node.update_host(); + for (size_t node_gid = 0; node_gid < num_nodes; node_gid++) { // position, var 0 vec_fields(node_gid, 0, 0) = node.coords.host(node_gid, 0); @@ -606,7 +623,7 @@ void write_vtu(Mesh_t& mesh, vec_fields(node_gid, 1, 2) = node.vector_field.host(node_gid, 2); point_scalar_fields(node_gid, 0) = rank; - point_scalar_fields(node_gid, 1) = (double)mesh.num_corners_in_node(node_gid); + point_scalar_fields(node_gid, 1) = num_elems_in_node.host(node_gid); point_scalar_fields(node_gid, 2) = (double)mesh.local_to_global_node_mapping.host(node_gid); point_scalar_fields(node_gid, 3) = node.scalar_field.host(node_gid); } diff --git a/src/include/communication_plan.h b/src/include/communication_plan.h index 3c1c48e9..21091eb2 100644 --- a/src/include/communication_plan.h +++ b/src/include/communication_plan.h @@ -187,14 +187,16 @@ enum class communication_plan_type { // Copy and store send neighbor IDs (out-bound neighbors: where we will send data to) this->send_rank_ids = DCArrayKokkos(num_send_ranks, "send_rank_ids"); for(int i = 0; i < num_send_ranks; i++){ - this->send_rank_ids(i) = send_rank_ids[i]; + this->send_rank_ids.host(i) = send_rank_ids[i]; } + this->send_rank_ids.update_device(); // Copy and store receive neighbor IDs (in-bound neighbors: where we will receive data from) this->recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); for(int i = 0; i < num_recv_ranks; i++){ - this->recv_rank_ids(i) = recv_rank_ids[i]; + 
this->recv_rank_ids.host(i) = recv_rank_ids[i]; } + this->recv_rank_ids.update_device(); // Create the distributed graph communicator. // This call links this process to its explicit send and receive neighbors. From 6ac9606899a516416cead8bbf9810bf27b84fa1b Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 24 Nov 2025 14:10:51 -0600 Subject: [PATCH 43/52] BUG: Fixed GPU build, broke node comms, WIP --- examples/mesh_decomp/decomp_utils.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 6eb758ee..d43aa497 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -2354,8 +2354,8 @@ void partition_mesh( gauss_point.fields_vec(i, 1) = value; gauss_point.fields_vec(i, 2) = value; }); - - gauss_point.fields_vec.update_device(); + gauss_point.fields.update_host(); + gauss_point.fields_vec.update_host(); @@ -2398,7 +2398,7 @@ void partition_mesh( final_node.scalar_field(final_mesh.nodes_in_elem(i, j)) = value; } }); - + final_node.scalar_field.update_host(); } #endif // DECOMP_UTILS_H \ No newline at end of file From 9217adb7017d2d839eafdbb838a5dd593d85d86c Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 24 Nov 2025 15:14:13 -0600 Subject: [PATCH 44/52] BUG: Chasing nodal comms bug --- examples/mesh_decomp/decomp_utils.h | 139 ++++++++++++++-------------- 1 file changed, 68 insertions(+), 71 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index d43aa497..e0600761 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -398,7 +398,7 @@ void naive_partition_mesh( std::cout<< "Rank = "<< i < owned_coords_send(num_dim*local_owned_count, 0.0); + std::vector owned_coords_send(3*local_owned_count, 0.0); for (int i = 0; i < local_owned_count; i++) { - for(int dim = 0; dim < num_dim; dim++){ - owned_coords_send[num_dim*i+dim] = 
input_node.coords.host(i,dim); - } + owned_coords_send[3*i+0] = input_node.coords.host(i,0); + owned_coords_send[3*i+1] = input_node.coords.host(i,1); + owned_coords_send[3*i+2] = input_node.coords.host(i,2); } - std::vector all_owned_coords(num_dim * total_owned, 0.0); + std::vector all_owned_coords(3 * total_owned, 0.0); // Create coordinate-specific counts and displacements (in units of doubles, not nodes) - MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout << " Getting coord_counts" << std::endl; - std::vector coord_counts(world_size); std::vector coord_displs(world_size); for (int r = 0; r < world_size; r++) { - coord_counts[r] = num_dim * owned_counts[r]; // Each node has num_dim doubles - coord_displs[r] = num_dim * owned_displs[r]; // Displacement in doubles + coord_counts[r] = 3 * owned_counts[r]; // Each node has 3 doubles + coord_displs[r] = 3 * owned_displs[r]; // Displacement in doubles } - MPI_Allgatherv(owned_coords_send.data(), num_dim*local_owned_count, MPI_DOUBLE, + MPI_Allgatherv(owned_coords_send.data(), 3*local_owned_count, MPI_DOUBLE, all_owned_coords.data(), coord_counts.data(), coord_displs.data(), MPI_DOUBLE, MPI_COMM_WORLD); // e) Build map: gid -> coord[3] - std::unordered_map> gid_to_coord; + std::unordered_map> gid_to_coord; for (int i = 0; i < total_owned; i++) { - std::vector xyz(num_dim); // size is runtime-dependent - for (int dim = 0; dim < num_dim; dim++) { - xyz[dim] = all_owned_coords[num_dim * i + dim]; - } - gid_to_coord[all_owned_gids[i]] = std::move(xyz); + std::array xyz = { + all_owned_coords[3*i+0], + all_owned_coords[3*i+1], + all_owned_coords[3*i+2] + }; + gid_to_coord[all_owned_gids[i]] = xyz; } // 4. Finally, fill output_node.coords with correct coordinates. 
@@ -1242,14 +1239,14 @@ void build_ghost( size_t gid = output_mesh.local_to_global_node_mapping.host(i); auto it = gid_to_coord.find(gid); if (it != gid_to_coord.end()) { - for (int dim = 0; dim < num_dim; dim++) { - output_node.coords.host(i,dim) = it->second[dim]; - } + output_node.coords.host(i,0) = it->second[0]; + output_node.coords.host(i,1) = it->second[1]; + output_node.coords.host(i,2) = it->second[2]; } else { // Could happen if there's a bug: fill with zeros for safety - for (int dim = 0; dim < num_dim; dim++) { - output_node.coords.host(i,dim) = 0.0; - } + output_node.coords.host(i,0) = 0.0; + output_node.coords.host(i,1) = 0.0; + output_node.coords.host(i,2) = 0.0; } } output_node.coords.update_device(); @@ -1317,7 +1314,6 @@ void build_ghost( } MPI_Barrier(MPI_COMM_WORLD); - if(rank == 0) std::cout<<"After boundary_elem_targets"< boundary_elem_local_ids; @@ -1358,7 +1354,7 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); output_mesh.num_boundary_elems = boundary_elem_local_ids.size(); - output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems, "boundary_elem_local_ids"); + output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems); for (int i = 0; i < output_mesh.num_boundary_elems; i++) { output_mesh.boundary_elem_local_ids.host(i) = boundary_elem_local_ids[i]; } @@ -1425,7 +1421,7 @@ void build_ghost( // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) // Could be used to specify communication volume if needed for optimization - // int* sourceweights = MPI_UNWEIGHTED; + int* sourceweights = MPI_UNWEIGHTED; // ---------- Prepare OUTGOING edges (destinations) ---------- // outdegree: Number of ranks to which this rank will SEND data @@ -1452,7 +1448,7 @@ void build_ghost( int* node_sources = (node_indegree > 0) ? 
ghost_node_receive_ranks_vec.data() : MPI_UNWEIGHTED; // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) - //int* node_sourceweights = MPI_UNWEIGHTED; + int* node_sourceweights = MPI_UNWEIGHTED; // ---------- Prepare OUTGOING edges (destinations) ---------- // outdegree: Number of ranks to which this rank will SEND data @@ -1461,12 +1457,11 @@ void build_ghost( int* node_destinations = (node_outdegree > 0) ? ghost_node_send_ranks_vec.data() : MPI_UNWEIGHTED; // destinationweights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) - // int* node_destinationweights = MPI_UNWEIGHTED; + int* node_destinationweights = MPI_UNWEIGHTED; // Initialize the graph communicator for node communication node_communication_plan.initialize_graph_communicator(node_outdegree, node_destinations, node_indegree, node_sources); MPI_Barrier(MPI_COMM_WORLD); - if (rank == 0) std::cout<<"After node graph communicator"< strides_array(element_communication_plan.num_send_ranks, "strides_for_elems_to_send"); + CArrayKokkos strides_array(element_communication_plan.num_send_ranks); for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { int dest_rank = element_communication_plan.send_rank_ids.host(i); - strides_array.host(i) = elems_to_send_by_rank[dest_rank].size(); + strides_array(i) = elems_to_send_by_rank[dest_rank].size(); } - strides_array.update_device(); DRaggedRightArrayKokkos elems_to_send_by_rank_rr(strides_array, "elems_to_send_by_rank"); // Fill in the data @@ -1523,13 +1517,12 @@ void build_ghost( } // ========== Serialize into a DRaggedRightArrayKokkos ========== - DCArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks, "elem_recv_strides_array"); + CArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks); for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { int source_rank = element_communication_plan.recv_rank_ids.host(i); - elem_recv_strides_array.host(i) 
= elems_to_recv_by_rank[source_rank].size(); + elem_recv_strides_array(i) = elems_to_recv_by_rank[source_rank].size(); } - elem_recv_strides_array.update_device(); DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); // Fill in the data for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { @@ -1539,7 +1532,6 @@ void build_ghost( } } elems_to_recv_by_rank_rr.update_device(); - MATAR_FENCE(); element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); MPI_Barrier(MPI_COMM_WORLD); @@ -1555,12 +1547,11 @@ void build_ghost( // -------------------------------------------------------------------------------------- // Serialize into a DRaggedRightArrayKokkos - DCArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks,"node_send_strides_array"); + CArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks); for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { int dest_rank = node_communication_plan.send_rank_ids.host(i); - node_send_strides_array.host(i) = nodes_to_send_by_rank[dest_rank].size(); + node_send_strides_array(i) = nodes_to_send_by_rank[dest_rank].size(); } - node_send_strides_array.update_device(); DRaggedRightArrayKokkos nodes_to_send_by_rank_rr(node_send_strides_array, "nodes_to_send_by_rank"); // Fill in the data @@ -1625,12 +1616,11 @@ void build_ghost( } // Serialize into a DRaggedRightArrayKokkos - DCArrayKokkos nodes_recv_strides_array(node_communication_plan.num_recv_ranks, "nodes_recv_strides_array"); + CArrayKokkos nodes_recv_strides_array(node_communication_plan.num_recv_ranks); for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { int source_rank = node_communication_plan.recv_rank_ids.host(i); - nodes_recv_strides_array.host(i) = nodes_to_recv_by_rank[source_rank].size(); + nodes_recv_strides_array(i) = nodes_to_recv_by_rank[source_rank].size(); } - 
nodes_recv_strides_array.update_device(); DRaggedRightArrayKokkos nodes_to_recv_by_rank_rr(nodes_recv_strides_array, "nodes_to_recv_by_rank"); // Fill in the data for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { @@ -1691,7 +1681,7 @@ void partition_mesh( int rank){ bool print_info = false; - // bool print_vtk = false; + bool print_vtk = false; int num_dim = initial_mesh.num_dims; @@ -1819,7 +1809,7 @@ void partition_mesh( for (size_t k = 0; k < naive_mesh.num_elems; k++) { int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); elem_gid_to_offset[elem_gid_on_rank] = current_offset; - current_offset += num_elems_in_elem_per_rank.host(k); + current_offset += num_elems_in_elem_per_rank(k); } // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- @@ -1846,11 +1836,11 @@ void partition_mesh( break; } } - size_t num_nbrs = num_elems_in_elem_per_rank.host(idx); + size_t num_nbrs = num_elems_in_elem_per_rank(idx); // Append each neighbor (by its GLOBAL elem GID) to edgeloctab for (size_t j = 0; j < num_nbrs; j++) { - size_t neighbor_gid = elems_in_elem_on_rank.host(elems_in_elem_offset + j); // This is a global element ID! + size_t neighbor_gid = elems_in_elem_on_rank(elems_in_elem_offset + j); // This is a global element ID! 
edgeloctab.push_back(static_cast(neighbor_gid)); ++offset; // Increment running edge count } @@ -2216,8 +2206,8 @@ void partition_mesh( // -------------- Phase 6: Build the intermediate_mesh -------------- intermediate_mesh.initialize_nodes(num_new_nodes); intermediate_mesh.initialize_elems(num_new_elems, naive_mesh.num_dims); - intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes, "intermediate_mesh.local_to_global_node_mapping"); - intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems, "intermediate_mesh.local_to_global_elem_mapping"); + intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes); + intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); // Fill global mappings for (int i = 0; i < num_new_nodes; i++) @@ -2325,10 +2315,10 @@ void partition_mesh( gauss_point.fields_vec.host(i, 2) = static_cast(rank); } for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { - gauss_point.fields.host(i) = -1.0; // Ghost elements should be updated - gauss_point.fields_vec.host(i, 0) = -1.0; - gauss_point.fields_vec.host(i, 1) = -1.0; - gauss_point.fields_vec.host(i, 2) = -1.0; + gauss_point.fields.host(i) = -100.0; // Ghost elements should be updated + gauss_point.fields_vec.host(i, 0) = -100.0; + gauss_point.fields_vec.host(i, 1) = -100.0; + gauss_point.fields_vec.host(i, 2) = -100.0; } gauss_point.fields.update_device(); gauss_point.fields_vec.update_device(); @@ -2337,25 +2327,32 @@ void partition_mesh( gauss_point.fields_vec.communicate(); // Loop over all elements and average the values of elements connected to that element - FOR_ALL(i, 0, final_mesh.num_elems, { + + CArrayKokkos tmp_store(final_mesh.num_elems); + FOR_ALL(i, 0, final_mesh.num_owned_elems, { double value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { - value += gauss_point.fields(final_mesh.elems_in_elem(i, j)); + value += 
gauss_point.fields.host(final_mesh.elems_in_elem(i, j)); } value /= final_mesh.num_elems_in_elem(i); - gauss_point.fields(i) = value; + tmp_store(i) = value; + // gauss_point.fields(i) = value; value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { - value += gauss_point.fields_vec(final_mesh.elems_in_elem(i, j), 0); + value += gauss_point.fields_vec.host(final_mesh.elems_in_elem(i, j), 0); } value /= final_mesh.num_elems_in_elem(i); - gauss_point.fields_vec(i, 0) = value; - gauss_point.fields_vec(i, 1) = value; - gauss_point.fields_vec(i, 2) = value; + gauss_point.fields_vec.host(i, 0) = value; + gauss_point.fields_vec.host(i, 1) = value; + gauss_point.fields_vec.host(i, 2) = value; }); - gauss_point.fields.update_host(); - gauss_point.fields_vec.update_host(); + + FOR_ALL(i, 0, final_mesh.num_owned_elems, { + gauss_point.fields(i) = tmp_store(i); + }); + + gauss_point.fields_vec.update_device(); @@ -2370,10 +2367,10 @@ void partition_mesh( final_node.vector_field.host(i, 2) = static_cast(rank); } for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { - final_node.scalar_field.host(i) = -1.0; - final_node.vector_field.host(i, 0) = -1.0; - final_node.vector_field.host(i, 1) = -1.0; - final_node.vector_field.host(i, 2) = -1.0; + final_node.scalar_field.host(i) = -100.0; + final_node.vector_field.host(i, 0) = -100.0; + final_node.vector_field.host(i, 1) = -100.0; + final_node.vector_field.host(i, 2) = -100.0; } final_node.coords.update_device(); @@ -2398,7 +2395,7 @@ void partition_mesh( final_node.scalar_field(final_mesh.nodes_in_elem(i, j)) = value; } }); - final_node.scalar_field.update_host(); + } #endif // DECOMP_UTILS_H \ No newline at end of file From e8602a2844274cfbdae6607b19364dfc4af7cacf Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 24 Nov 2025 15:47:20 -0600 Subject: [PATCH 45/52] Revert "BUG: Chasing nodal comms bug" This reverts commit 9217adb7017d2d839eafdbb838a5dd593d85d86c. 
--- examples/mesh_decomp/decomp_utils.h | 139 ++++++++++++++-------------- 1 file changed, 71 insertions(+), 68 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index e0600761..d43aa497 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -398,7 +398,7 @@ void naive_partition_mesh( std::cout<< "Rank = "<< i < owned_coords_send(3*local_owned_count, 0.0); + std::vector owned_coords_send(num_dim*local_owned_count, 0.0); for (int i = 0; i < local_owned_count; i++) { - owned_coords_send[3*i+0] = input_node.coords.host(i,0); - owned_coords_send[3*i+1] = input_node.coords.host(i,1); - owned_coords_send[3*i+2] = input_node.coords.host(i,2); + for(int dim = 0; dim < num_dim; dim++){ + owned_coords_send[num_dim*i+dim] = input_node.coords.host(i,dim); + } } - std::vector all_owned_coords(3 * total_owned, 0.0); + std::vector all_owned_coords(num_dim * total_owned, 0.0); // Create coordinate-specific counts and displacements (in units of doubles, not nodes) + MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout << " Getting coord_counts" << std::endl; + std::vector coord_counts(world_size); std::vector coord_displs(world_size); for (int r = 0; r < world_size; r++) { - coord_counts[r] = 3 * owned_counts[r]; // Each node has 3 doubles - coord_displs[r] = 3 * owned_displs[r]; // Displacement in doubles + coord_counts[r] = num_dim * owned_counts[r]; // Each node has num_dim doubles + coord_displs[r] = num_dim * owned_displs[r]; // Displacement in doubles } - MPI_Allgatherv(owned_coords_send.data(), 3*local_owned_count, MPI_DOUBLE, + MPI_Allgatherv(owned_coords_send.data(), num_dim*local_owned_count, MPI_DOUBLE, all_owned_coords.data(), coord_counts.data(), coord_displs.data(), MPI_DOUBLE, MPI_COMM_WORLD); // e) Build map: gid -> coord[3] - std::unordered_map> gid_to_coord; + std::unordered_map> gid_to_coord; for (int i = 0; i < total_owned; i++) { - std::array xyz = { - 
all_owned_coords[3*i+0], - all_owned_coords[3*i+1], - all_owned_coords[3*i+2] - }; - gid_to_coord[all_owned_gids[i]] = xyz; + std::vector xyz(num_dim); // size is runtime-dependent + for (int dim = 0; dim < num_dim; dim++) { + xyz[dim] = all_owned_coords[num_dim * i + dim]; + } + gid_to_coord[all_owned_gids[i]] = std::move(xyz); } // 4. Finally, fill output_node.coords with correct coordinates. @@ -1239,14 +1242,14 @@ void build_ghost( size_t gid = output_mesh.local_to_global_node_mapping.host(i); auto it = gid_to_coord.find(gid); if (it != gid_to_coord.end()) { - output_node.coords.host(i,0) = it->second[0]; - output_node.coords.host(i,1) = it->second[1]; - output_node.coords.host(i,2) = it->second[2]; + for (int dim = 0; dim < num_dim; dim++) { + output_node.coords.host(i,dim) = it->second[dim]; + } } else { // Could happen if there's a bug: fill with zeros for safety - output_node.coords.host(i,0) = 0.0; - output_node.coords.host(i,1) = 0.0; - output_node.coords.host(i,2) = 0.0; + for (int dim = 0; dim < num_dim; dim++) { + output_node.coords.host(i,dim) = 0.0; + } } } output_node.coords.update_device(); @@ -1314,6 +1317,7 @@ void build_ghost( } MPI_Barrier(MPI_COMM_WORLD); + if(rank == 0) std::cout<<"After boundary_elem_targets"< boundary_elem_local_ids; @@ -1354,7 +1358,7 @@ void build_ghost( MPI_Barrier(MPI_COMM_WORLD); output_mesh.num_boundary_elems = boundary_elem_local_ids.size(); - output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems); + output_mesh.boundary_elem_local_ids = DCArrayKokkos(output_mesh.num_boundary_elems, "boundary_elem_local_ids"); for (int i = 0; i < output_mesh.num_boundary_elems; i++) { output_mesh.boundary_elem_local_ids.host(i) = boundary_elem_local_ids[i]; } @@ -1421,7 +1425,7 @@ void build_ghost( // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) // Could be used to specify communication volume if needed for optimization - int* sourceweights = MPI_UNWEIGHTED; + // int* 
sourceweights = MPI_UNWEIGHTED; // ---------- Prepare OUTGOING edges (destinations) ---------- // outdegree: Number of ranks to which this rank will SEND data @@ -1448,7 +1452,7 @@ void build_ghost( int* node_sources = (node_indegree > 0) ? ghost_node_receive_ranks_vec.data() : MPI_UNWEIGHTED; // sourceweights: Weights on incoming edges (not used here, set to MPI_UNWEIGHTED) - int* node_sourceweights = MPI_UNWEIGHTED; + //int* node_sourceweights = MPI_UNWEIGHTED; // ---------- Prepare OUTGOING edges (destinations) ---------- // outdegree: Number of ranks to which this rank will SEND data @@ -1457,11 +1461,12 @@ void build_ghost( int* node_destinations = (node_outdegree > 0) ? ghost_node_send_ranks_vec.data() : MPI_UNWEIGHTED; // destinationweights: Weights on outgoing edges (not used here, set to MPI_UNWEIGHTED) - int* node_destinationweights = MPI_UNWEIGHTED; + // int* node_destinationweights = MPI_UNWEIGHTED; // Initialize the graph communicator for node communication node_communication_plan.initialize_graph_communicator(node_outdegree, node_destinations, node_indegree, node_sources); MPI_Barrier(MPI_COMM_WORLD); + if (rank == 0) std::cout<<"After node graph communicator"< strides_array(element_communication_plan.num_send_ranks); + DCArrayKokkos strides_array(element_communication_plan.num_send_ranks, "strides_for_elems_to_send"); for (int i = 0; i < element_communication_plan.num_send_ranks; i++) { int dest_rank = element_communication_plan.send_rank_ids.host(i); - strides_array(i) = elems_to_send_by_rank[dest_rank].size(); + strides_array.host(i) = elems_to_send_by_rank[dest_rank].size(); } + strides_array.update_device(); DRaggedRightArrayKokkos elems_to_send_by_rank_rr(strides_array, "elems_to_send_by_rank"); // Fill in the data @@ -1517,12 +1523,13 @@ void build_ghost( } // ========== Serialize into a DRaggedRightArrayKokkos ========== - CArrayKokkos elem_recv_strides_array(element_communication_plan.num_recv_ranks); + DCArrayKokkos 
elem_recv_strides_array(element_communication_plan.num_recv_ranks, "elem_recv_strides_array"); for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { int source_rank = element_communication_plan.recv_rank_ids.host(i); - elem_recv_strides_array(i) = elems_to_recv_by_rank[source_rank].size(); + elem_recv_strides_array.host(i) = elems_to_recv_by_rank[source_rank].size(); } + elem_recv_strides_array.update_device(); DRaggedRightArrayKokkos elems_to_recv_by_rank_rr(elem_recv_strides_array, "elems_to_recv_by_rank"); // Fill in the data for (int i = 0; i < element_communication_plan.num_recv_ranks; i++) { @@ -1532,6 +1539,7 @@ void build_ghost( } } elems_to_recv_by_rank_rr.update_device(); + MATAR_FENCE(); element_communication_plan.setup_send_recv(elems_to_send_by_rank_rr, elems_to_recv_by_rank_rr); MPI_Barrier(MPI_COMM_WORLD); @@ -1547,11 +1555,12 @@ void build_ghost( // -------------------------------------------------------------------------------------- // Serialize into a DRaggedRightArrayKokkos - CArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks); + DCArrayKokkos node_send_strides_array(node_communication_plan.num_send_ranks,"node_send_strides_array"); for (int i = 0; i < node_communication_plan.num_send_ranks; i++) { int dest_rank = node_communication_plan.send_rank_ids.host(i); - node_send_strides_array(i) = nodes_to_send_by_rank[dest_rank].size(); + node_send_strides_array.host(i) = nodes_to_send_by_rank[dest_rank].size(); } + node_send_strides_array.update_device(); DRaggedRightArrayKokkos nodes_to_send_by_rank_rr(node_send_strides_array, "nodes_to_send_by_rank"); // Fill in the data @@ -1616,11 +1625,12 @@ void build_ghost( } // Serialize into a DRaggedRightArrayKokkos - CArrayKokkos nodes_recv_strides_array(node_communication_plan.num_recv_ranks); + DCArrayKokkos nodes_recv_strides_array(node_communication_plan.num_recv_ranks, "nodes_recv_strides_array"); for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) 
{ int source_rank = node_communication_plan.recv_rank_ids.host(i); - nodes_recv_strides_array(i) = nodes_to_recv_by_rank[source_rank].size(); + nodes_recv_strides_array.host(i) = nodes_to_recv_by_rank[source_rank].size(); } + nodes_recv_strides_array.update_device(); DRaggedRightArrayKokkos nodes_to_recv_by_rank_rr(nodes_recv_strides_array, "nodes_to_recv_by_rank"); // Fill in the data for (int i = 0; i < node_communication_plan.num_recv_ranks; i++) { @@ -1681,7 +1691,7 @@ void partition_mesh( int rank){ bool print_info = false; - bool print_vtk = false; + // bool print_vtk = false; int num_dim = initial_mesh.num_dims; @@ -1809,7 +1819,7 @@ void partition_mesh( for (size_t k = 0; k < naive_mesh.num_elems; k++) { int elem_gid_on_rank = naive_mesh.local_to_global_elem_mapping.host(k); elem_gid_to_offset[elem_gid_on_rank] = current_offset; - current_offset += num_elems_in_elem_per_rank(k); + current_offset += num_elems_in_elem_per_rank.host(k); } // --- Step 3: Fill in the CSR arrays, looping over each locally-owned element --- @@ -1836,11 +1846,11 @@ void partition_mesh( break; } } - size_t num_nbrs = num_elems_in_elem_per_rank(idx); + size_t num_nbrs = num_elems_in_elem_per_rank.host(idx); // Append each neighbor (by its GLOBAL elem GID) to edgeloctab for (size_t j = 0; j < num_nbrs; j++) { - size_t neighbor_gid = elems_in_elem_on_rank(elems_in_elem_offset + j); // This is a global element ID! + size_t neighbor_gid = elems_in_elem_on_rank.host(elems_in_elem_offset + j); // This is a global element ID! 
edgeloctab.push_back(static_cast(neighbor_gid)); ++offset; // Increment running edge count } @@ -2206,8 +2216,8 @@ void partition_mesh( // -------------- Phase 6: Build the intermediate_mesh -------------- intermediate_mesh.initialize_nodes(num_new_nodes); intermediate_mesh.initialize_elems(num_new_elems, naive_mesh.num_dims); - intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes); - intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems); + intermediate_mesh.local_to_global_node_mapping = DCArrayKokkos(num_new_nodes, "intermediate_mesh.local_to_global_node_mapping"); + intermediate_mesh.local_to_global_elem_mapping = DCArrayKokkos(num_new_elems, "intermediate_mesh.local_to_global_elem_mapping"); // Fill global mappings for (int i = 0; i < num_new_nodes; i++) @@ -2315,10 +2325,10 @@ void partition_mesh( gauss_point.fields_vec.host(i, 2) = static_cast(rank); } for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { - gauss_point.fields.host(i) = -100.0; // Ghost elements should be updated - gauss_point.fields_vec.host(i, 0) = -100.0; - gauss_point.fields_vec.host(i, 1) = -100.0; - gauss_point.fields_vec.host(i, 2) = -100.0; + gauss_point.fields.host(i) = -1.0; // Ghost elements should be updated + gauss_point.fields_vec.host(i, 0) = -1.0; + gauss_point.fields_vec.host(i, 1) = -1.0; + gauss_point.fields_vec.host(i, 2) = -1.0; } gauss_point.fields.update_device(); gauss_point.fields_vec.update_device(); @@ -2327,32 +2337,25 @@ void partition_mesh( gauss_point.fields_vec.communicate(); // Loop over all elements and average the values of elements connected to that element - - CArrayKokkos tmp_store(final_mesh.num_elems); - FOR_ALL(i, 0, final_mesh.num_owned_elems, { + FOR_ALL(i, 0, final_mesh.num_elems, { double value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { - value += gauss_point.fields.host(final_mesh.elems_in_elem(i, j)); + value += 
gauss_point.fields(final_mesh.elems_in_elem(i, j)); } value /= final_mesh.num_elems_in_elem(i); - tmp_store(i) = value; - // gauss_point.fields(i) = value; + gauss_point.fields(i) = value; value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { - value += gauss_point.fields_vec.host(final_mesh.elems_in_elem(i, j), 0); + value += gauss_point.fields_vec(final_mesh.elems_in_elem(i, j), 0); } value /= final_mesh.num_elems_in_elem(i); - gauss_point.fields_vec.host(i, 0) = value; - gauss_point.fields_vec.host(i, 1) = value; - gauss_point.fields_vec.host(i, 2) = value; + gauss_point.fields_vec(i, 0) = value; + gauss_point.fields_vec(i, 1) = value; + gauss_point.fields_vec(i, 2) = value; }); - - FOR_ALL(i, 0, final_mesh.num_owned_elems, { - gauss_point.fields(i) = tmp_store(i); - }); - - gauss_point.fields_vec.update_device(); + gauss_point.fields.update_host(); + gauss_point.fields_vec.update_host(); @@ -2367,10 +2370,10 @@ void partition_mesh( final_node.vector_field.host(i, 2) = static_cast(rank); } for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { - final_node.scalar_field.host(i) = -100.0; - final_node.vector_field.host(i, 0) = -100.0; - final_node.vector_field.host(i, 1) = -100.0; - final_node.vector_field.host(i, 2) = -100.0; + final_node.scalar_field.host(i) = -1.0; + final_node.vector_field.host(i, 0) = -1.0; + final_node.vector_field.host(i, 1) = -1.0; + final_node.vector_field.host(i, 2) = -1.0; } final_node.coords.update_device(); @@ -2395,7 +2398,7 @@ void partition_mesh( final_node.scalar_field(final_mesh.nodes_in_elem(i, j)) = value; } }); - + final_node.scalar_field.update_host(); } #endif // DECOMP_UTILS_H \ No newline at end of file From c7f500ad07ec334312af912f66cd9a12f822a824 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 24 Nov 2025 16:06:03 -0600 Subject: [PATCH 46/52] BUG: Chasing cuda bug still --- examples/mesh_decomp/decomp_utils.h | 11 ++++++++++- examples/mesh_decomp/mesh_io.h | 4 +--- 2 files 
changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index d43aa497..118e4fc5 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -2335,6 +2335,8 @@ void partition_mesh( gauss_point.fields.communicate(); gauss_point.fields_vec.communicate(); + + CArrayKokkos tmp(final_mesh.num_elems); // Loop over all elements and average the values of elements connected to that element FOR_ALL(i, 0, final_mesh.num_elems, { @@ -2343,7 +2345,9 @@ void partition_mesh( value += gauss_point.fields(final_mesh.elems_in_elem(i, j)); } value /= final_mesh.num_elems_in_elem(i); - gauss_point.fields(i) = value; + + tmp(i) = value; + value = 0.0; for (int j = 0; j < final_mesh.num_elems_in_elem(i); j++) { @@ -2354,6 +2358,11 @@ void partition_mesh( gauss_point.fields_vec(i, 1) = value; gauss_point.fields_vec(i, 2) = value; }); + + FOR_ALL(i, 0, final_mesh.num_elems, { + gauss_point.fields(i) = tmp(i); + }); + gauss_point.fields.update_host(); gauss_point.fields_vec.update_host(); diff --git a/examples/mesh_decomp/mesh_io.h b/examples/mesh_decomp/mesh_io.h index 04c2cad1..79eec569 100644 --- a/examples/mesh_decomp/mesh_io.h +++ b/examples/mesh_decomp/mesh_io.h @@ -354,9 +354,6 @@ void build_3d_box( point_scalar_fields(node_gid, 1) = (double)mesh.num_corners_in_node(node_gid); point_scalar_fields(node_gid, 2) = node.scalar_field.host(node_gid); - if(node_gid == 0) { - std::cout << "*******[rank " << rank << "] - num_corners_in_node: " << mesh.num_corners_in_node(node_gid) << std::endl; - } } // end for loop over vertices @@ -588,6 +585,7 @@ void write_vtu(Mesh_t& mesh, }); MATAR_FENCE(); num_elems_in_elem.update_host(); + MATAR_FENCE(); for (size_t elem_gid = 0; elem_gid < num_elems; elem_gid++) { elem_fields(elem_gid, 0) = rank; From 1953d5d047239bc974c20edc4be13a82516e5a3f Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Mon, 24 Nov 2025 17:14:42 -0600 Subject: [PATCH 47/52] 
BUG: Chasing cuda+MPI bug --- examples/mesh_decomp/decomp_utils.h | 62 ++++++++++++++++++++++++----- src/include/communication_plan.h | 2 + src/include/mpi_types.h | 2 +- 3 files changed, 55 insertions(+), 11 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index 118e4fc5..ac88fc37 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -295,6 +295,7 @@ void naive_partition_mesh( dim, 0, num_dim,{ naive_node.coords(node_id, dim) = node_pos_on_rank(node_id, dim); }); + MATAR_FENCE(); naive_node.coords.update_host(); @@ -433,16 +434,18 @@ void naive_partition_mesh( tmp_elems_in_elem(elem_gid, i) = initial_mesh.elems_in_elem(elem_gid, i); } // end for i }); // end FOR_ALL elems + MATAR_FENCE(); tmp_elems_in_elem.update_host(); - Kokkos::fence(); + DCArrayKokkos tmp_num_elems_in_elem(initial_mesh.num_elems, "tmp_elems_in_elem"); FOR_ALL(i, 0, initial_mesh.num_elems, { tmp_num_elems_in_elem(i) = initial_mesh.num_elems_in_elem(i); }); - tmp_num_elems_in_elem.update_host(); MATAR_FENCE(); + tmp_num_elems_in_elem.update_host(); + for(int i = 0; i < world_size; i++) { sendcounts[i] = elem_elem_counts[i]; @@ -2326,16 +2329,20 @@ void partition_mesh( } for (int i = final_mesh.num_owned_elems; i < final_mesh.num_elems; i++) { gauss_point.fields.host(i) = -1.0; // Ghost elements should be updated - gauss_point.fields_vec.host(i, 0) = -1.0; - gauss_point.fields_vec.host(i, 1) = -1.0; - gauss_point.fields_vec.host(i, 2) = -1.0; + gauss_point.fields_vec.host(i, 0) = -100.0; + gauss_point.fields_vec.host(i, 1) = -100.0; + gauss_point.fields_vec.host(i, 2) = -100.0; } gauss_point.fields.update_device(); gauss_point.fields_vec.update_device(); + + MPI_Barrier(MPI_COMM_WORLD); gauss_point.fields.communicate(); gauss_point.fields_vec.communicate(); + MPI_Barrier(MPI_COMM_WORLD); + CArrayKokkos tmp(final_mesh.num_elems); // Loop over all elements and average the values of elements connected to that 
element @@ -2358,10 +2365,12 @@ void partition_mesh( gauss_point.fields_vec(i, 1) = value; gauss_point.fields_vec(i, 2) = value; }); + MATAR_FENCE(); FOR_ALL(i, 0, final_mesh.num_elems, { gauss_point.fields(i) = tmp(i); }); + MATAR_FENCE(); gauss_point.fields.update_host(); gauss_point.fields_vec.update_host(); @@ -2371,6 +2380,19 @@ void partition_mesh( // Test node communication using MPI_Neighbor_alltoallv std::vector node_states = {node_state::coords, node_state::scalar_field, node_state::vector_field}; final_node.initialize(final_mesh.num_nodes, 3, node_states, node_communication_plan); + + for (int r = 0; r < world_size; r++) { + MPI_Barrier(MPI_COMM_WORLD); + if (rank == r) { + std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; + std::cout << "[rank " << rank << "] - Owned elements: " << final_mesh.num_owned_elems << std::endl; + std::cout << "[rank " << rank << "] - Ghost elements: " << final_mesh.num_elems - final_mesh.num_owned_elems << std::endl; + std::cout << "[rank " << rank << "] - Owned nodes: " << final_mesh.num_owned_nodes << std::endl; + std::cout << "[rank " << rank << "] - Ghost-only nodes: " << final_mesh.num_nodes - final_mesh.num_owned_nodes << std::endl; + std::cout << std::flush; + } + MPI_Barrier(MPI_COMM_WORLD); + } for (int i = 0; i < final_mesh.num_owned_nodes; i++) { final_node.scalar_field.host(i) = static_cast(rank); @@ -2379,15 +2401,19 @@ void partition_mesh( final_node.vector_field.host(i, 2) = static_cast(rank); } for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { - final_node.scalar_field.host(i) = -1.0; - final_node.vector_field.host(i, 0) = -1.0; - final_node.vector_field.host(i, 1) = -1.0; - final_node.vector_field.host(i, 2) = -1.0; + final_node.scalar_field.host(i) = -100.0; + final_node.vector_field.host(i, 0) = -100.0; + final_node.vector_field.host(i, 1) = -100.0; + final_node.vector_field.host(i, 2) = -100.0; } final_node.coords.update_device(); 
final_node.scalar_field.update_device(); final_node.vector_field.update_device(); + MATAR_FENCE(); + MPI_Barrier(MPI_COMM_WORLD); + + node_communication_plan.verify_graph_communicator(); final_node.scalar_field.communicate(); // final_node.vector_field.communicate(); @@ -2395,6 +2421,8 @@ void partition_mesh( // Update scalar field to visualize the communication + + CArrayKokkos tmp_too(final_mesh.num_elems); FOR_ALL(i, 0, final_mesh.num_elems, { double value = 0.0; @@ -2402,12 +2430,26 @@ void partition_mesh( value += final_node.scalar_field(final_mesh.nodes_in_elem(i, j)); } value /= final_mesh.num_nodes_in_elem; + tmp_too(i) = value; + }); + MATAR_FENCE(); + FOR_ALL(i, 0, final_mesh.num_elems, { for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - final_node.scalar_field(final_mesh.nodes_in_elem(i, j)) = value; + final_node.scalar_field(final_mesh.nodes_in_elem(i, j)) = tmp_too(i); } }); + MATAR_FENCE(); + + MPI_Barrier(MPI_COMM_WORLD); + + if(rank == 0)std::cout<<"Print from rank 0"<send_rank_ids.host(i) = send_rank_ids[i]; } this->send_rank_ids.update_device(); + MATAR_FENCE(); // Copy and store receive neighbor IDs (in-bound neighbors: where we will receive data from) this->recv_rank_ids = DCArrayKokkos(num_recv_ranks, "recv_rank_ids"); @@ -197,6 +198,7 @@ enum class communication_plan_type { this->recv_rank_ids.host(i) = recv_rank_ids[i]; } this->recv_rank_ids.update_device(); + MATAR_FENCE(); // Create the distributed graph communicator. // This call links this process to its explicit send and receive neighbors. 
diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index b0999049..4546fd48 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -303,7 +303,7 @@ class MPICArrayKokkos { void communicate(){ this_array_.update_host(); - + MATAR_FENCE(); fill_send_buffer(); MPI_Neighbor_alltoallv( From 1a3e45a427647e93d8a4c2382601ae5b2b8c69ca Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 25 Nov 2025 13:24:11 -0600 Subject: [PATCH 48/52] ENH: CUDA builds working --- examples/mesh_decomp/decomp_utils.h | 97 ++++++++++++---------------- examples/mesh_decomp/mesh_decomp.cpp | 2 +- examples/mesh_decomp/mesh_io.h | 2 - src/include/mpi_types.h | 3 +- 4 files changed, 44 insertions(+), 60 deletions(-) diff --git a/examples/mesh_decomp/decomp_utils.h b/examples/mesh_decomp/decomp_utils.h index ac88fc37..24c75d46 100644 --- a/examples/mesh_decomp/decomp_utils.h +++ b/examples/mesh_decomp/decomp_utils.h @@ -2261,31 +2261,31 @@ void partition_mesh( // Fill node coordinates // coord_recvbuf contains coords in element-node order, but we need them in node order // Build a map from node GID to coordinates - std::map> node_gid_to_coords; + std::map> node_gid_to_coords; int coord_idx = 0; - for (int e = 0; e < intermediate_mesh.num_elems; ++e) { + for (int e = 0; e < intermediate_mesh.num_elems; e++) { for (int j = 0; j < intermediate_mesh.num_nodes_in_elem; j++) { int node_gid = conn_recvbuf[e * intermediate_mesh.num_nodes_in_elem + j]; if (node_gid_to_coords.find(node_gid) == node_gid_to_coords.end()) { - node_gid_to_coords[node_gid] = { - coord_recvbuf[coord_idx*3 + 0], - coord_recvbuf[coord_idx*3 + 1], - coord_recvbuf[coord_idx*3 + 2] - }; + std::vector coords(num_dim); + for (int d = 0; d < num_dim; d++) { + coords[d] = coord_recvbuf[coord_idx * num_dim + d]; + } + node_gid_to_coords[node_gid] = coords; } coord_idx++; } } // Now fill coordinates in node order - intermediate_node.initialize(num_new_nodes, 3, {node_state::coords}); + 
intermediate_node.initialize(num_new_nodes, num_dim, {node_state::coords}); for (int i = 0; i < num_new_nodes; i++) { int node_gid = new_node_gids[i]; auto it = node_gid_to_coords.find(node_gid); if (it != node_gid_to_coords.end()) { - intermediate_node.coords.host(i, 0) = it->second[0]; - intermediate_node.coords.host(i, 1) = it->second[1]; - intermediate_node.coords.host(i, 2) = it->second[2]; + for (int d = 0; d < num_dim; d++) { + intermediate_node.coords.host(i, d) = it->second[d]; + } } } intermediate_node.coords.update_device(); @@ -2361,6 +2361,7 @@ void partition_mesh( value += gauss_point.fields_vec(final_mesh.elems_in_elem(i, j), 0); } value /= final_mesh.num_elems_in_elem(i); + gauss_point.fields_vec(i, 0) = value; gauss_point.fields_vec(i, 1) = value; gauss_point.fields_vec(i, 2) = value; @@ -2380,31 +2381,18 @@ void partition_mesh( // Test node communication using MPI_Neighbor_alltoallv std::vector node_states = {node_state::coords, node_state::scalar_field, node_state::vector_field}; final_node.initialize(final_mesh.num_nodes, 3, node_states, node_communication_plan); - - for (int r = 0; r < world_size; r++) { - MPI_Barrier(MPI_COMM_WORLD); - if (rank == r) { - std::cout << "[rank " << rank << "] Finished building extended mesh structure" << std::endl; - std::cout << "[rank " << rank << "] - Owned elements: " << final_mesh.num_owned_elems << std::endl; - std::cout << "[rank " << rank << "] - Ghost elements: " << final_mesh.num_elems - final_mesh.num_owned_elems << std::endl; - std::cout << "[rank " << rank << "] - Owned nodes: " << final_mesh.num_owned_nodes << std::endl; - std::cout << "[rank " << rank << "] - Ghost-only nodes: " << final_mesh.num_nodes - final_mesh.num_owned_nodes << std::endl; - std::cout << std::flush; - } - MPI_Barrier(MPI_COMM_WORLD); - } for (int i = 0; i < final_mesh.num_owned_nodes; i++) { final_node.scalar_field.host(i) = static_cast(rank); - final_node.vector_field.host(i, 0) = static_cast(rank); - 
final_node.vector_field.host(i, 1) = static_cast(rank); - final_node.vector_field.host(i, 2) = static_cast(rank); + for(int dim = 0; dim < num_dim; dim++){ + final_node.vector_field.host(i, dim) = static_cast(rank); + } } for (int i = final_mesh.num_owned_nodes; i < final_mesh.num_nodes; i++) { final_node.scalar_field.host(i) = -100.0; - final_node.vector_field.host(i, 0) = -100.0; - final_node.vector_field.host(i, 1) = -100.0; - final_node.vector_field.host(i, 2) = -100.0; + for(int dim = 0; dim < num_dim; dim++){ + final_node.vector_field.host(i, dim) = -100; + } } final_node.coords.update_device(); @@ -2416,38 +2404,35 @@ void partition_mesh( node_communication_plan.verify_graph_communicator(); final_node.scalar_field.communicate(); - // final_node.vector_field.communicate(); - MPI_Barrier(MPI_COMM_WORLD); - - - // Update scalar field to visualize the communication - - CArrayKokkos tmp_too(final_mesh.num_elems); - FOR_ALL(i, 0, final_mesh.num_elems, { - - double value = 0.0; - for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - value += final_node.scalar_field(final_mesh.nodes_in_elem(i, j)); - } - value /= final_mesh.num_nodes_in_elem; - tmp_too(i) = value; - }); + final_node.vector_field.communicate(); + MATAR_FENCE(); + MPI_Barrier(MPI_COMM_WORLD); - FOR_ALL(i, 0, final_mesh.num_elems, { - for(int j = 0; j < final_mesh.num_nodes_in_elem; j++) { - final_node.scalar_field(final_mesh.nodes_in_elem(i, j)) = tmp_too(i); - } - }); - MATAR_FENCE(); + DCArrayKokkos tmp_too(final_mesh.num_nodes); + for(int smooth = 0; smooth < 3; smooth++){ + FOR_ALL(i, 0, final_mesh.num_nodes, { - MPI_Barrier(MPI_COMM_WORLD); + double value = final_node.scalar_field(i); + for(int j = 0; j < final_mesh.num_nodes_in_node(i); j++){ + value += final_node.scalar_field(final_mesh.nodes_in_node(i, j)); + } + value /= final_mesh.num_nodes_in_node(i) + 1; + tmp_too(i) = value; + }); + MATAR_FENCE(); - if(rank == 0)std::cout<<"Print from rank 0"<::value(), // MPI_TYPE 
comm_plan_->mpi_comm_graph); - + MATAR_FENCE(); copy_recv_buffer(); + MATAR_FENCE(); this_array_.update_device(); }; From 614bf4cd0d7fe6b5874a1ab466009718fc582a6f Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 25 Nov 2025 14:49:48 -0600 Subject: [PATCH 49/52] ENH: Tidying up --- src/include/communication_plan.h | 23 +++------ src/include/mpi_types.h | 86 ++++++++++++++++++++++++-------- 2 files changed, 71 insertions(+), 38 deletions(-) diff --git a/src/include/communication_plan.h b/src/include/communication_plan.h index d0140d43..f6613c10 100644 --- a/src/include/communication_plan.h +++ b/src/include/communication_plan.h @@ -9,25 +9,14 @@ using namespace mtr; -/** - * @struct CommunicationPlan - * @brief Manages efficient MPI communication for ghost element and node data exchange - * - * Pure data-oriented design with only flat, contiguous arrays for maximum cache efficiency. - * Designed to be embedded in distributed data structures for automatic ghost synchronization. - * - * Usage pattern in distributed structures: - * node.velocity.comm() -> automatically syncs ghost nodes - * elem.density.comm() -> automatically syncs ghost elements - * - */ + enum class communication_plan_type { no_communication, all_to_all_graph }; - struct CommunicationPlan { +struct CommunicationPlan { // ======================================================================== // Metadata for MPI neighbor graph communication @@ -220,6 +209,7 @@ enum class communication_plan_type { has_comm_graph = true; } + // Useful function for debugging, possibly remove void verify_graph_communicator(){ if(!has_comm_graph){ throw std::runtime_error("MPI graph communicator has not been initialized"); @@ -320,6 +310,7 @@ enum class communication_plan_type { MPI_Barrier(mpi_comm_world); } + // Setup send/receive metadata void setup_send_recv(DRaggedRightArrayKokkos &rank_send_ids, DRaggedRightArrayKokkos &rank_recv_ids){ this->send_indices_ = rank_send_ids; // indices of element data to send to 
each rank @@ -360,10 +351,10 @@ enum class communication_plan_type { } } this->recv_displs_.update_device(); - - MPI_Barrier(mpi_comm_world); + MATAR_FENCE(); } + // Useful function for debugging, possibly remove void verify_send_recv(){ if(!has_comm_graph){ @@ -511,8 +502,6 @@ enum class communication_plan_type { throw std::runtime_error("Send/Recv communication plan verification failed"); } } - - }; // End of CommunicationPlan #endif // end if HAVE_MPI diff --git a/src/include/mpi_types.h b/src/include/mpi_types.h index aa744678..5f83265b 100644 --- a/src/include/mpi_types.h +++ b/src/include/mpi_types.h @@ -83,9 +83,9 @@ class MPICArrayKokkos { DCArrayKokkos recv_buffer_; protected: - size_t dims_[7]; - size_t length_; - size_t order_; // tensor order (rank) + size_t dims_[7] = {0,0,0,0,0,0,0}; + size_t length_ = 0; + size_t order_ = 0; // tensor order (rank) MPI_Comm mpi_comm_; MPI_Status mpi_status_; @@ -94,7 +94,7 @@ class MPICArrayKokkos { // --- Ghost Communication Support --- - CommunicationPlan* comm_plan_; // Pointer to shared communication plan + CommunicationPlan* comm_plan_ = NULL; // Pointer to shared communication plan DCArrayKokkos send_counts_; // [size: num_send_ranks] Number of items to send to each rank @@ -113,7 +113,7 @@ class MPICArrayKokkos { size_t num_ghost_; // Number of ghost items (nodes/elements) public: - // Data member to access host view + // Data member to access host view (initialized as pointer to this_array_.host_pointer()) ViewCArray host; @@ -143,8 +143,6 @@ class MPICArrayKokkos { size_t dim3, size_t dim4, size_t dim5, size_t dim6, const std::string& tag_string = DEFAULTSTRINGARRAY); - - KOKKOS_INLINE_FUNCTION T& operator()(size_t i) const; @@ -254,6 +252,10 @@ class MPICArrayKokkos { // Such that all the boundary elements going to a given rank are contiguous in the send buffer. 
void fill_send_buffer(){ + // Copy this_array_ to the host + this_array_.update_host(); + MATAR_FENCE(); + size_t send_idx = 0; for(int i = 0; i < comm_plan_->num_send_ranks; i++){ for(int j = 0; j < comm_plan_->send_counts_.host(i); j++){ @@ -284,7 +286,6 @@ class MPICArrayKokkos { recv_idx += stride_; } } - this_array_.update_device(); }; @@ -300,10 +301,25 @@ class MPICArrayKokkos { // Method that communicates the data between the ranks // NOTE: This is a blocking communication operation, // if you want to use non-blocking communication, you can use the following: MPI_Ineighbor_alltoallv + + // TODO: Replace this with persistent communicator: + // MPI_Request req; + + // // Create persistent operation ONCE + // MPI_Neighbor_alltoallv_init( + // sendbuf, sendcounts, sdispls, mpi_type_map::value(), + // recvbuf, recvcounts, rdispls, mpi_type_map::value(), + // comm_plan_->mpi_comm_graph, + // MPI_INFO_NULL, + // &req); + + // // Then inside time step loop: + // MPI_Start(&req); + // // modify sendbuf in-place as needed + // MPI_Wait(&req); + void communicate(){ - this_array_.update_host(); - MATAR_FENCE(); fill_send_buffer(); MPI_Neighbor_alltoallv( @@ -316,11 +332,10 @@ class MPICArrayKokkos { recv_displs_.host_pointer(), mpi_type_map::value(), // MPI_TYPE comm_plan_->mpi_comm_graph); - MATAR_FENCE(); + copy_recv_buffer(); - MATAR_FENCE(); - this_array_.update_device(); + MATAR_FENCE(); }; void set_values(const T& value){ @@ -339,7 +354,7 @@ MPICArrayKokkos::MPICArrayKokkos() for (int i = 0; i < 7; i++) { dims_[i] = 0; } - } +} // Overloaded 1D constructor template @@ -507,12 +522,41 @@ T& MPICArrayKokkos::operator()(size_t i, size_t template KOKKOS_INLINE_FUNCTION MPICArrayKokkos& MPICArrayKokkos::operator=(const MPICArrayKokkos& temp) { - this_array_ = temp.this_array_; - host = temp.host; // Also copy the host ViewCArray - comm_plan_ = temp.comm_plan_; - send_buffer_ = temp.send_buffer_; - recv_buffer_ = temp.recv_buffer_; - stride_ = temp.stride_; + + // Do 
nothing if the assignment is of the form x = x + if (this != &temp) { + + this_array_ = temp.this_array_; + send_buffer_ = temp.send_buffer_; + recv_buffer_ = temp.recv_buffer_; + + length_ = temp.length_; + + for (int iter = 0; iter < temp.order_; iter++){ + dims_[iter] = temp.dims_[iter]; + } // end for + + order_ = temp.order_; + + mpi_status_ = temp.mpi_status_; + mpi_datatype_ = temp.mpi_datatype_; + mpi_request_ = temp.mpi_request_; + comm_plan_ = temp.comm_plan_; + + send_counts_ = temp.send_counts_; + recv_counts_ = temp.recv_counts_; + send_displs_ = temp.send_displs_; + recv_displs_ = temp.recv_displs_; + stride_ = temp.stride_; + + send_indices_ = temp.send_indices_; + recv_indices_ = temp.recv_indices_; + + num_owned_ = temp.num_owned_; + num_ghost_ = temp.num_ghost_; + + host = temp.host; // Also copy the host ViewCArray + } return *this; } @@ -533,7 +577,7 @@ template ::dims(size_t i) const { assert(i < order_ && "MPICArrayKokkos order (rank) does not match constructor, dim[i] does not exist!"); - assert(dims_[i]>0 && "Access to MPICArrayKokkos dims is out of bounds!"); + assert(dims_[i] > 0 && "Access to MPICArrayKokkos dims is out of bounds!"); return this_array_.dims(i); } From 6156a6bad82a527c7dc7af32962a0af4015f216f Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 25 Nov 2025 15:08:58 -0600 Subject: [PATCH 50/52] BUG: Correct default build script behavior --- scripts/build-matar.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/build-matar.sh b/scripts/build-matar.sh index fa95bc12..30f384dc 100755 --- a/scripts/build-matar.sh +++ b/scripts/build-matar.sh @@ -71,7 +71,7 @@ show_help() { build_action="full-app" execution="examples" machine="linux" -kokkos_build_type="openmp" +kokkos_build_type="serial" build_cores="1" trilinos="disabled" intel_mkl="disabled" From 7dee53afc22395b4e012b50b0d4caa2594e6af26 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 25 Nov 2025 15:17:50 -0600 Subject: [PATCH 51/52] ENH: 
Update minimum cmake version --- benchmark/CMakeLists.txt | 2 +- examples/gArrayofgArrays/CMakeLists.txt | 2 +- examples/halfspace_cooling/CMakeLists.txt | 2 +- examples/laplace/CMakeLists.txt | 2 +- examples/laplaceMPI/CMakeLists.txt | 2 +- examples/mesh_decomp/CMakeLists.txt | 2 +- examples/phaseField/srcKokkosVerbose/CMakeLists.txt | 2 +- examples/phaseField/srcMacros/CMakeLists.txt | 2 +- examples/phaseFieldMPI/CMakeLists.txt | 2 +- examples/sparsetests/CMakeLists.txt | 2 +- examples/test_rocm/CMakeLists.txt | 2 +- examples/virtualFcnKokkos/CMakeLists.txt | 2 +- examples/virtualFcnMATAR/CMakeLists.txt | 2 +- examples/watt-graph/CMakeLists.txt | 2 +- test/test_cases/CMakeLists.txt | 2 +- 15 files changed, 15 insertions(+), 15 deletions(-) diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 372ad21c..0a548973 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) project (matarbenchmark) diff --git a/examples/gArrayofgArrays/CMakeLists.txt b/examples/gArrayofgArrays/CMakeLists.txt index 33a5fa97..e90dd1da 100644 --- a/examples/gArrayofgArrays/CMakeLists.txt +++ b/examples/gArrayofgArrays/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) find_package(Matar REQUIRED) diff --git a/examples/halfspace_cooling/CMakeLists.txt b/examples/halfspace_cooling/CMakeLists.txt index dbcaa6f9..91bffb75 100644 --- a/examples/halfspace_cooling/CMakeLists.txt +++ b/examples/halfspace_cooling/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) find_package(Matar REQUIRED) diff --git a/examples/laplace/CMakeLists.txt b/examples/laplace/CMakeLists.txt index acbd4a1f..b3122cd0 100644 --- a/examples/laplace/CMakeLists.txt +++ b/examples/laplace/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) 
find_package(Matar REQUIRED) diff --git a/examples/laplaceMPI/CMakeLists.txt b/examples/laplaceMPI/CMakeLists.txt index 5b114927..d722fac9 100644 --- a/examples/laplaceMPI/CMakeLists.txt +++ b/examples/laplaceMPI/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) if (KOKKOS) #find_package(Kokkos REQUIRED) #new diff --git a/examples/mesh_decomp/CMakeLists.txt b/examples/mesh_decomp/CMakeLists.txt index b5ea83ca..6c8901da 100644 --- a/examples/mesh_decomp/CMakeLists.txt +++ b/examples/mesh_decomp/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) # Find MPI find_package(MPI REQUIRED) diff --git a/examples/phaseField/srcKokkosVerbose/CMakeLists.txt b/examples/phaseField/srcKokkosVerbose/CMakeLists.txt index 0da1896c..4f473fd7 100644 --- a/examples/phaseField/srcKokkosVerbose/CMakeLists.txt +++ b/examples/phaseField/srcKokkosVerbose/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) if (KOKKOS) diff --git a/examples/phaseField/srcMacros/CMakeLists.txt b/examples/phaseField/srcMacros/CMakeLists.txt index 0da1896c..4f473fd7 100644 --- a/examples/phaseField/srcMacros/CMakeLists.txt +++ b/examples/phaseField/srcMacros/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) if (KOKKOS) diff --git a/examples/phaseFieldMPI/CMakeLists.txt b/examples/phaseFieldMPI/CMakeLists.txt index 3650430a..4b8c6961 100644 --- a/examples/phaseFieldMPI/CMakeLists.txt +++ b/examples/phaseFieldMPI/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) #project (phasefield_mpi) diff --git a/examples/sparsetests/CMakeLists.txt b/examples/sparsetests/CMakeLists.txt index b8e3164d..a0f4c506 100644 --- a/examples/sparsetests/CMakeLists.txt +++ b/examples/sparsetests/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 
3.1.3) +cmake_minimum_required(VERSION 3.5) if (KOKKOS) #find_package(Kokkos REQUIRED) diff --git a/examples/test_rocm/CMakeLists.txt b/examples/test_rocm/CMakeLists.txt index 31c4c2e2..564bb7e3 100644 --- a/examples/test_rocm/CMakeLists.txt +++ b/examples/test_rocm/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) #project (test_rocm) diff --git a/examples/virtualFcnKokkos/CMakeLists.txt b/examples/virtualFcnKokkos/CMakeLists.txt index b0673270..89f72fab 100644 --- a/examples/virtualFcnKokkos/CMakeLists.txt +++ b/examples/virtualFcnKokkos/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) #project (virttestkokkos) diff --git a/examples/virtualFcnMATAR/CMakeLists.txt b/examples/virtualFcnMATAR/CMakeLists.txt index 4e232051..22873a82 100644 --- a/examples/virtualFcnMATAR/CMakeLists.txt +++ b/examples/virtualFcnMATAR/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) find_package(Matar REQUIRED) diff --git a/examples/watt-graph/CMakeLists.txt b/examples/watt-graph/CMakeLists.txt index 9db93716..3061157a 100644 --- a/examples/watt-graph/CMakeLists.txt +++ b/examples/watt-graph/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) if (NOT KOKKOS) diff --git a/test/test_cases/CMakeLists.txt b/test/test_cases/CMakeLists.txt index 01cc23c0..a0e07edd 100644 --- a/test/test_cases/CMakeLists.txt +++ b/test/test_cases/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) # Find all test files in the current directory except test_main.cpp file(GLOB TEST_SOURCES "test_*.cpp") From e643d947dea723dd6be205e51c117c9c33ba57c1 Mon Sep 17 00:00:00 2001 From: Jacob Moore Date: Tue, 25 Nov 2025 15:21:36 -0600 Subject: [PATCH 52/52] ENH: Missing cmake update --- test/CMakeLists.txt | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8f7fa4c2..e6c2bfaf 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.1.3) +cmake_minimum_required(VERSION 3.5) project (matartest)